Commit b58e5556 authored by Baber

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml
parents 6e1866f5 4f8195f1
......@@ -35,7 +35,7 @@ repos:
- id: ruff
args:
- --fix
# Run the formatter.
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
......@@ -43,8 +43,10 @@ repos:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.30
......@@ -52,9 +54,3 @@ repos:
- id: pymarkdown
exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
args: [fix, -r]
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.5.1
# hooks:
# - id: mypy
# additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
# exclude: ^tests/.*$
......@@ -5,7 +5,7 @@
---
## Latest News 📣
- [2025/07] Added a `think_end_token` arg to `hf` (token id or string), `vllm`, and `sglang` (string) for stripping CoT reasoning traces from models that emit them; see the usage sketch after this list.
- [2025/03] Added support for steering HF models!
- [2025/02] Added [SGLang](https://docs.sglang.ai/) support!
- [2024/09] We are prototyping support for evaluating tasks with text+image multimodal inputs and text outputs, and have added the `hf-multimodal` and `vllm-vlm` model types and the `mmmu` task as a prototype feature. We welcome users to try out and stress-test this in-progress feature, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project that originally forked off of lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
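A rough usage sketch for the `think_end_token` addition follows. This is a hedged illustration, not a canonical invocation: the model id and task are placeholders, and `think_end_token` may instead be an integer token id when using the `hf` backend.

```python
# Hedged sketch: strip chain-of-thought up to and including the end-of-thinking
# marker from a reasoning model's generations. Model id and task are placeholders.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args={
        "pretrained": "your-org/your-reasoning-model",  # placeholder model id
        "think_end_token": "</think>",  # or an int token id with the hf backend
    },
    tasks=["gsm8k"],
)
```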
......
......@@ -21,7 +21,11 @@ When subclassing `TemplateAPI`, you need to implement the following methods:
1. `_create_payload`: Creates the JSON payload for API requests.
2. `parse_logprobs`: Parses log probabilities from API responses.
3. `parse_generations`: Parses generated text from API responses.
Optional Properties:
4. `header`: Returns the headers for the API request.
5. `api_key`: Returns the API key for authentication (if required).
You may also need to override other methods or properties depending on your API's specific requirements.
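For orientation, a minimal sketch of such a subclass is shown below. It assumes an OpenAI-completions-style JSON response; the class name, registry name, payload shape, and response fields are hypothetical, so treat it as a template rather than a drop-in implementation (see `lm_eval/models/openai_completions.py` for a real reference).

```python
# Minimal sketch of a TemplateAPI subclass, under the assumptions stated above.
from typing import Dict, List, Optional, Tuple

from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI


@register_model("my-completions-api")  # hypothetical registry name
class MyCompletionsAPI(TemplateAPI):
    def _create_payload(
        self, messages, *, generate: bool = True, gen_kwargs: Optional[dict] = None, **kwargs
    ) -> dict:
        # 1. Build the JSON body the endpoint expects.
        if generate:
            return {"model": self.model, "prompt": messages, **(gen_kwargs or {})}
        # loglikelihood requests: echo the prompt and ask for per-token logprobs
        return {"model": self.model, "prompt": messages, "logprobs": 1, "echo": True, "max_tokens": 0}

    @staticmethod
    def parse_logprobs(outputs, tokens=None, ctxlens=None, **kwargs) -> List[Tuple[float, bool]]:
        # 2. Sum continuation-token logprobs; the greedy check is omitted for brevity.
        outputs = outputs if isinstance(outputs, list) else [outputs]
        res = []
        for out, ctxlen in zip(outputs, ctxlens or []):
            token_logprobs = out["choices"][0]["logprobs"]["token_logprobs"][ctxlen:]
            res.append((sum(token_logprobs), True))
        return res

    @staticmethod
    def parse_generations(outputs, **kwargs) -> List[str]:
        # 3. Extract generated strings from (possibly batched) responses.
        outputs = outputs if isinstance(outputs, list) else [outputs]
        return [choice["text"] for out in outputs for choice in out["choices"]]

    @property
    def header(self) -> Dict[str, str]:
        # 4. Optional: custom headers instead of the default Bearer token.
        return {"Authorization": f"Bearer {self.api_key}", "X-Client": "lm-eval"}
```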
......@@ -97,6 +101,10 @@ When initializing a `TemplateAPI` instance or a subclass, you can provide severa
- Whether to validate the certificate of the API endpoint (if HTTPS).
- Default is True.
- `header` (dict, optional):
- Custom headers for API requests.
- If not provided, uses `{"Authorization": f"Bearer {self.api_key}"}` by default.
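As a hedged illustration of the new `header` option (the endpoint URL, model name, and header values below are made up; the canonical example follows):

```python
# Sketch: pass custom headers programmatically instead of the default
# {"Authorization": f"Bearer {api_key}"}. All values here are placeholders.
from lm_eval.models.openai_completions import LocalCompletionsAPI

lm = LocalCompletionsAPI(
    base_url="http://localhost:8000/v1/completions",
    model="my-local-model",
    header={"Authorization": "Bearer local-token", "X-Org": "my-org"},
)
```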
Example usage:
```python
......
......@@ -435,10 +435,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets
from packaging.version import parse as vparse
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if vparse(datasets.__version__) < vparse("4.0.0"):
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True"
if isinstance(args.model_args, dict):
args.model_args["trust_remote_code"] = True
else:
args.model_args = args.model_args + ",trust_remote_code=True"
(
eval_logger.info(f"Selected Tasks: {task_names}")
if eval_logger.getEffectiveLevel() >= logging.INFO
......
......@@ -505,7 +505,6 @@ def bootstrap_stderr(
if not os.getenv("DISABLE_MULTIPROC"):
import multiprocessing as mp
pool = mp.Pool(mp.cpu_count())
# this gives a biased estimate of the stderr (i.e., with the mean, it gives something
# equivalent to the stderr calculated without Bessel's correction in the stddev).
# Unfortunately, I haven't been able to figure out what the right correction is
......@@ -517,17 +516,16 @@ def bootstrap_stderr(
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(
pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)],
),
total=iters // chunk_size,
):
# sample w replacement
res.extend(bootstrap)
pool.close()
with mp.Pool(mp.cpu_count()) as pool:
for bootstrap in tqdm(
pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)],
),
total=iters // chunk_size,
):
# sample w replacement
res.extend(bootstrap)
else:
res = _bootstrap_internal_no_mp(f, xs, iters)
......
......@@ -3,18 +3,15 @@ import ast
import logging
import random
import re
from collections.abc import Callable
from collections.abc import Callable, Iterable, Iterator, Mapping
from copy import deepcopy
from dataclasses import asdict, dataclass
from inspect import getsource
from typing import (
Any,
Dict,
Iterable,
Iterator,
List,
Literal,
Mapping,
Optional,
Tuple,
Union,
......@@ -113,7 +110,7 @@ class TaskConfig(dict):
if "until" not in self.generation_kwargs:
eval_logger.warning(
f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={repr(self.fewshot_delimiter)}"
f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={self.fewshot_delimiter!r}"
)
self.generation_kwargs["until"] = [self.fewshot_delimiter]
else:
......@@ -289,17 +286,14 @@ class Task(abc.ABC):
@abc.abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
def training_docs(self) -> Iterable:
"""
......@@ -518,7 +512,6 @@ class Task(abc.ABC):
The number of times each instance in a dataset is inferred on. Defaults to 1,
can be increased for techniques like majority voting.
"""
pass
@abc.abstractmethod
def process_results(self, doc, results):
......@@ -531,7 +524,6 @@ class Task(abc.ABC):
:param results:
The results of the requests created in construct_requests.
"""
pass
@abc.abstractmethod
def aggregation(self):
......@@ -540,7 +532,6 @@ class Task(abc.ABC):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metric scores
"""
pass
@abc.abstractmethod
def higher_is_better(self):
......@@ -549,7 +540,6 @@ class Task(abc.ABC):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
pass
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
......@@ -675,8 +665,8 @@ class Task(abc.ABC):
self.aggregation = lambda: {
metric_name: get_metric_aggregation(metric_name)
}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
self._config.metric_list = [{"metric": metric_name}]
self._config.process_results = None
def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
self.fewshot_rnd = random.Random(seed)
......@@ -835,7 +825,7 @@ class ConfigurableTask(Task):
agg_name = metric_config["aggregation"]
if isinstance(agg_name, str):
self._aggregation_list[metric_name] = get_aggregation(agg_name)
elif callable(agg_name): # noqa: E721
elif callable(agg_name):
self._aggregation_list[metric_name] = metric_config[
"aggregation"
]
......@@ -980,6 +970,10 @@ class ConfigurableTask(Task):
def download(
self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
) -> None:
from packaging.version import parse as vparse
if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
dataset_kwargs.pop("trust_remote_code", None)
if isinstance(self.config.custom_dataset, Callable):
eval_logger.warning(
f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
......@@ -1498,7 +1492,7 @@ class ConfigurableTask(Task):
): # TODO: ensure that non-multimodal tasks aren't getting visual args
multimodal_arg = {
**multimodal_arg,
**{"visual": self.doc_to_image(doc)},
"visual": self.doc_to_image(doc),
}
if (
......@@ -1506,7 +1500,7 @@ class ConfigurableTask(Task):
): # TODO: ensure that non-multimodal tasks aren't getting audio args
multimodal_arg = {
**multimodal_arg,
**{"audio": self.doc_to_audio(doc)},
"audio": self.doc_to_audio(doc),
}
if bool(multimodal_arg):
......@@ -1769,7 +1763,7 @@ class MultipleChoiceTask(Task):
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " {}".format(choice)),
arguments=(ctx, f" {choice}"),
idx=i,
**kwargs,
)
......
......@@ -35,6 +35,7 @@ from lm_eval.utils import (
positional_deprecated,
setup_logging,
simple_parse_args_string,
wrap_text,
)
......@@ -169,8 +170,11 @@ def simple_evaluate(
)
) and not apply_chat_template:
eval_logger.warning(
"Model appears to be an instruct or chat variant but chat template is not applied. "
"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
wrap_text(
f"""pretrained={model_args.get("pretrained") if isinstance(model_args, dict) else model_args} appears to be an
instruct or chat variant but chat template is not applied.
Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).""",
)
)
if delete_requests_cache:
......@@ -234,7 +238,9 @@ def simple_evaluate(
else:
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
wrap_text(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args,
......
......@@ -135,6 +135,7 @@ class TemplateAPI(TemplateLM):
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
header: Optional[Dict[str, str]] = None,
max_images: int = 1,
**kwargs,
) -> None:
......@@ -152,6 +153,7 @@ class TemplateAPI(TemplateLM):
self.model = model or pretrained
self.base_url = base_url
self.tokenizer = tokenizer
self._header = header
if not isinstance(batch_size, int) and "auto" in batch_size:
eval_logger.warning(
"Automatic batch size is not supported for API models. Defaulting to batch size 1."
......@@ -296,7 +298,7 @@ class TemplateAPI(TemplateLM):
@cached_property
def header(self) -> dict:
"""Override this property to return the headers for the API request."""
return {"Authorization": f"Bearer {self.api_key}"}
return self._header or {"Authorization": f"Bearer {self.api_key}"}
@property
def tokenizer_name(self) -> str:
......@@ -447,6 +449,7 @@ class TemplateAPI(TemplateLM):
async def amodel_call(
self,
session: ClientSession,
sem: asyncio.Semaphore,
messages: Union[List[List[int]], List[str], List[JsonChatStr]],
*,
generate: bool = True,
......@@ -465,6 +468,7 @@ class TemplateAPI(TemplateLM):
**kwargs,
)
cache_method = "generate_until" if generate else "loglikelihood"
acquired = await sem.acquire()
try:
async with session.post(
self.base_url,
......@@ -474,7 +478,8 @@ class TemplateAPI(TemplateLM):
if not response.ok:
error_text = await response.text()
eval_logger.warning(
f"API request failed with error message: {error_text}. Retrying..."
f"API request failed! Status code: {response.status}, "
f"Response text: {error_text}. Retrying..."
)
# raising exception will retry the request
response.raise_for_status()
......@@ -495,11 +500,12 @@ class TemplateAPI(TemplateLM):
self.cache_hook.add_partial(cache_method, cache, res)
return answers
# If the retries also fail
except RetryError:
eval_logger.error(
"API request failed after multiple retries. Please check the API status."
)
return None
except BaseException as e:
eval_logger.error(f"Exception:{repr(e)}, {outputs}, retrying.")
raise e
finally:
if acquired:
sem.release()
def batch_loglikelihood_requests(
self, chunks: Iterable[List[LogLikelihoodInputs]]
......@@ -535,6 +541,7 @@ class TemplateAPI(TemplateLM):
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate)
sem = asyncio.Semaphore(self._concurrent)
async with ClientSession(
connector=conn, timeout=ClientTimeout(total=self.timeout)
) as session:
......@@ -542,12 +549,16 @@ class TemplateAPI(TemplateLM):
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
reraise=True,
before_sleep=lambda retry_state: eval_logger.info(
f"Retry attempt {retry_state.attempt_number}"
),
)(self.amodel_call)
# Create tasks for each batch of request
tasks = [
asyncio.create_task(
retry_(
session=session,
sem=sem,
messages=message,
cache_keys=cache_key,
generate=generate,
......
from __future__ import annotations
import copy
import logging
import os
from collections.abc import Iterator, Sequence
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Literal
import jinja2
import torch
......@@ -17,6 +20,7 @@ from accelerate import (
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from packaging.version import parse as vparse
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
......@@ -24,7 +28,6 @@ from transformers.models.auto.modeling_auto import (
)
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
......@@ -34,20 +37,23 @@ from lm_eval.models.utils import (
get_dtype,
handle_stop_sequences,
pad_and_concat,
postprocess_generated_text,
stop_sequences_criteria,
)
if TYPE_CHECKING:
from transformers.quantizers import AutoQuantizationConfig
from transformers.quantizers.auto import AutoQuantizationConfig
from lm_eval.api.instance import Instance
eval_logger = logging.getLogger(__name__)
TOKENIZER_INFINITY = 1000000000000000019884624838656
@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
"""
An abstracted Huggingface model class. Enables usage with both models of
"""An abstracted Huggingface model class. Enables usage with both models of
`transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
Supports data-parallel multi-GPU with HF Accelerate.
......@@ -58,42 +64,45 @@ class HFLM(TemplateLM):
def __init__(
self,
pretrained: Union[str, transformers.PreTrainedModel],
pretrained: str | transformers.PreTrainedModel,
backend: Literal["default", "causal", "seq2seq"] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
revision: str | None = "main",
subfolder: str = "",
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
] = None,
truncation: Optional[bool] = False,
tokenizer: str
| transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerFast
| None = None,
truncation: bool | None = False,
logits_cache: bool = True,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
softmax_dtype: Optional[Union[str, torch.dtype]] = None,
batch_size: Optional[Union[int, str]] = 1,
max_batch_size: Optional[int] = 64,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
add_bos_token: Optional[bool] = False,
prefix_token_id: Optional[int] = None,
max_length: int | None = None,
device: str | None = "cuda",
dtype: str | torch.dtype | None = "auto",
softmax_dtype: str | torch.dtype | None = None,
mixed_precision_dtype: str | torch.dtype | None = None,
batch_size: int | str | None = 1,
max_batch_size: int | None = 64,
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
add_bos_token: bool | None = False,
prefix_token_id: int | None = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
parallelize: bool | None = False,
max_memory_per_gpu: int | str | None = None,
max_cpu_memory: int | str | None = None,
offload_folder: str | os.PathLike | None = "./offload",
# PEFT, delta weights and quantization options
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None,
peft: str | None = None,
delta: str | None = None,
autogptq: bool | str | None = False,
gptqmodel: bool | None = False,
gguf_file: str | None = None,
# end token for thinking, either the string or int token id.
# splits to get response after this token (if provided).
think_end_token: str | int | None = None,
enable_thinking: bool | None = None,
chat_template_args: dict[str, Any] | None = None,
**kwargs,
) -> None:
super().__init__()
......@@ -223,11 +232,21 @@ class HFLM(TemplateLM):
self.model.eval()
self.model.tie_weights()
self.think_end_token = (
int(think_end_token)
if (isinstance(think_end_token, str) and think_end_token.isdigit())
else think_end_token
)
self.truncation = truncation
self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
self.chat_template_args = (
(chat_template_args or {}) | dict(enable_thinking=enable_thinking)
if enable_thinking is not None
else (chat_template_args or {})
)
self.add_bos_token = add_bos_token
if "gemma" in getattr(self.config, "model_type", ""):
......@@ -247,6 +266,11 @@ class HFLM(TemplateLM):
self.softmax_dtype = (
get_dtype(softmax_dtype) if softmax_dtype is not None else None
)
self.mixed_precision_dtype = (
get_dtype(mixed_precision_dtype)
if mixed_precision_dtype is not None
else None
)
if str(batch_size).startswith("auto"):
batch_size = batch_size.split(":")
......@@ -256,18 +280,19 @@ class HFLM(TemplateLM):
self.batch_size_per_gpu = int(batch_size)
if isinstance(pretrained, str):
if gpus >= 1 or str(self.device) == "mps":
if (gpus >= 1 or str(self.device) == "mps") and not (
parallelize or autogptq or hasattr(self, "accelerator")
):
# TODO: can remove this whole snippet except in the mps case, perhaps?
if not (parallelize or autogptq or hasattr(self, "accelerator")):
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# place model onto device requested manually,
# if not using HF Accelerate or device_map
# or any other option that preloads model onto device
try:
self.model.to(self.device)
except ValueError:
eval_logger.debug(
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
)
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
if accelerator.num_processes > 1:
......@@ -312,12 +337,12 @@ class HFLM(TemplateLM):
def _get_accelerate_args(
self,
parallelize: Optional[bool] = None,
device_map: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
gpus: Optional[int] = None,
parallelize: bool | None = None,
device_map: str | None = "auto",
max_memory_per_gpu: int | str | None = None,
max_cpu_memory: int | str | None = None,
offload_folder: str | None = "./offload",
gpus: int | None = None,
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
......@@ -355,13 +380,8 @@ class HFLM(TemplateLM):
}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
if not hasattr(self, "accelerator"):
max_memory_per_gpu_map = {
k: v for k, v in max_memory_all_gpus.items()
}
else:
max_memory_all_gpus.pop("cpu", None)
if hasattr(self, "accelerator"):
# use only 1 / num_processes of the GPUs if we are running under accelerate launch
max_memory_per_gpu_map = {
k: v
......@@ -369,6 +389,9 @@ class HFLM(TemplateLM):
if k % num_local_processes
== (self.accelerator.process_index % num_local_processes)
}
else:
max_memory_per_gpu_map = max_memory_all_gpus
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto" if device_map is None else device_map
eval_logger.info(
......@@ -412,12 +435,12 @@ class HFLM(TemplateLM):
return self._model
@property
def eot_token_id(self):
def eot_token_id(self) -> int:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
def prefix_token_id(self) -> int:
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
......@@ -426,7 +449,7 @@ class HFLM(TemplateLM):
return self.tokenizer.eos_token_id
@property
def max_length(self):
def max_length(self) -> int:
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
......@@ -434,7 +457,7 @@ class HFLM(TemplateLM):
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
if self.tokenizer.model_max_length == TOKENIZER_INFINITY:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
......@@ -465,12 +488,12 @@ class HFLM(TemplateLM):
def _get_backend(
self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
config: transformers.PretrainedConfig | transformers.AutoConfig,
backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: Optional[bool] = False,
trust_remote_code: bool | None = False,
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
......@@ -482,9 +505,7 @@ class HFLM(TemplateLM):
if backend != "default":
# if we've settled on non-default backend, use that manually
if backend == "causal":
self.backend = backend
elif backend == "seq2seq":
if backend in ["causal", "seq2seq"]:
self.backend = backend
eval_logger.info(
f"Overrode HF model backend type, and using type '{self.backend}'"
......@@ -492,7 +513,7 @@ class HFLM(TemplateLM):
else:
# determine and use the default HF backend for this model, based on its config + metadata.
if (
getattr(config, "model_type")
getattr(config, "model_type", None)
in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
):
# first check if model type is listed under seq2seq models, since some
......@@ -501,7 +522,7 @@ class HFLM(TemplateLM):
self.backend = "seq2seq"
eval_logger.debug(f"Using model type '{self.backend}'")
elif (
getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
):
self.backend = "causal"
eval_logger.debug(f"Using model type '{self.backend}'")
......@@ -530,10 +551,10 @@ class HFLM(TemplateLM):
pretrained: str,
revision: str = "main",
trust_remote_code: bool = False,
gguf_file: Optional[str] = None,
gguf_file: str | None = None,
subfolder: str = "",
) -> None:
"""Return the model config for HuggingFace models"""
"""Return the model config for HuggingFace models."""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -545,29 +566,28 @@ class HFLM(TemplateLM):
def _create_model(
self,
pretrained: str,
revision: Optional[str] = "main",
dtype: Optional[Union[str, torch.dtype]] = "auto",
trust_remote_code: Optional[bool] = False,
revision: str | None = "main",
dtype: str | torch.dtype | None = "auto",
trust_remote_code: bool | None = False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
# (accelerate naive PP (device_map) options)
parallelize: Optional[bool] = False,
gpus: Optional[int] = None,
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
parallelize: bool | None = False,
gpus: int | None = None,
max_memory_per_gpu: int | str | None = None,
max_cpu_memory: int | str | None = None,
offload_folder: str | None = "./offload",
# PEFT, delta weights and quantization options
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None,
quantization_config: Optional["AutoQuantizationConfig"] = None,
peft: str | None = None,
delta: str | None = None,
autogptq: bool | str | None = False,
gptqmodel: bool | None = False,
gguf_file: str | None = None,
quantization_config: AutoQuantizationConfig | None = None,
subfolder: str = "",
**kwargs,
) -> None:
"""
Initializes an HF or HF-compatible PreTrainedModel from scratch
"""Initializes an HF or HF-compatible PreTrainedModel from scratch
inside HFLM, using the kwargs passed into self.__init__().
Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
......@@ -578,12 +598,12 @@ class HFLM(TemplateLM):
please consider subclassing HFLM and overriding this and other methods as needed.
"""
model_kwargs = kwargs if kwargs else {}
model_kwargs = kwargs or {}
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
device_map=kwargs.get("device_map"),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
......@@ -592,16 +612,12 @@ class HFLM(TemplateLM):
)
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
assert transformers.__version__ >= "4.30.0", (
if model_kwargs.get("load_in_4bit"):
assert vparse(transformers.__version__) >= vparse("4.30.0"), (
"load_in_4bit requires transformers >= 4.30.0"
)
if transformers.__version__ >= "4.30.0":
if model_kwargs.get("load_in_4bit", None):
if model_kwargs.get("bnb_4bit_compute_dtype", None):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
model_kwargs["bnb_4bit_compute_dtype"]
)
if compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
......@@ -626,7 +642,7 @@ class HFLM(TemplateLM):
raise type(exception)(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
)
) from exception
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
......@@ -645,7 +661,7 @@ class HFLM(TemplateLM):
raise type(exception)(
"Tried to load gptqmodel, but gptqmodel is not installed ",
"please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
)
) from exception
self._model = GPTQModel.from_quantized(
pretrained, trust_remote_code=trust_remote_code, **model_kwargs
......@@ -660,9 +676,10 @@ class HFLM(TemplateLM):
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
if model_kwargs.get("load_in_4bit", None):
if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if model_kwargs.get("load_in_4bit") and vparse(PEFT_VERSION) < vparse(
"0.4.0"
):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
eval_logger.info(
......@@ -687,36 +704,32 @@ class HFLM(TemplateLM):
for name, param in self._model.state_dict().items():
try:
param.data += _model_delta.state_dict()[name]
except KeyError:
raise KeyError(f"Delta model is missing weights for layer: {name}")
except KeyError as e:
raise KeyError(
f"Delta model is missing weights for layer: {name}"
) from e
except Exception as e:
raise RuntimeError(
f"Failed to add delta weights to layer {name}. Error: {e}"
)
) from e
del _model_delta
return None
def _create_tokenizer(
self,
pretrained: Union[str, transformers.PreTrainedModel],
tokenizer: Optional[
Union[
str,
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
]
],
revision: Optional[str] = "main",
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
gguf_file: Optional[str] = None,
add_bos_token: Optional[bool] = False,
subfolder: Optional[str] = "",
pretrained: str | transformers.PreTrainedModel,
tokenizer: str
| transformers.PreTrainedTokenizer
| transformers.PreTrainedTokenizerFast
| None,
revision: str | None = "main",
trust_remote_code: bool | None = False,
use_fast_tokenizer: bool | None = True,
gguf_file: str | None = None,
add_bos_token: bool | None = False,
subfolder: str | None = "",
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Create a tokenizer object corresponding to the correct
tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
......@@ -745,8 +758,12 @@ class HFLM(TemplateLM):
)
else:
assert isinstance(
tokenizer, transformers.PreTrainedTokenizer
) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
tokenizer,
(
transformers.PreTrainedTokenizer,
transformers.PreTrainedTokenizerFast,
),
)
self.tokenizer = tokenizer
else:
# Get tokenizer based on 'pretrained'
......@@ -758,9 +775,8 @@ class HFLM(TemplateLM):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name, **kwargs
)
return None
def _detect_batch_size(self, requests=None, pos: int = 0):
def _detect_batch_size(self, requests: Sequence | None = None, pos: int = 0):
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len(
......@@ -775,7 +791,7 @@ class HFLM(TemplateLM):
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
def forward_batch(batch_size: int):
if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
......@@ -822,8 +838,11 @@ class HFLM(TemplateLM):
return batch_size
def tok_encode(
self, string: str, left_truncate_len=None, add_special_tokens=None
) -> List[int]:
self,
string: str,
left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
) -> list[int]:
""" """
# default for None - empty dict, use predefined tokenizer param
# used for all models except for CausalLM or predefined value
......@@ -849,11 +868,11 @@ class HFLM(TemplateLM):
def tok_batch_encode(
self,
strings: List[str],
strings: list[str],
padding_side: str = "left",
left_truncate_len: int = None,
left_truncate_len: int | None = None,
truncation: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
......@@ -872,7 +891,7 @@ class HFLM(TemplateLM):
if left_truncate_len:
original_lengths = encoding["input_ids"].size(1)
if original_lengths > left_truncate_len:
eval_logger.warn(
eval_logger.warning(
f"Left truncation applied. Original sequence length was {original_lengths}, "
f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
)
......@@ -884,11 +903,17 @@ class HFLM(TemplateLM):
return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens, skip_special_tokens=True):
def tok_decode(self, tokens: Iterator[list[str]], skip_special_tokens: bool = True):
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_call(self, inps, attn_mask=None, labels=None):
def _model_call(
self,
inps: torch.Tensor,
attn_mask: torch.Tensor | None = None,
labels: torch.Tensor | None = None,
) -> torch.Tensor:
"""
:param inps: torch.Tensor
A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
[batch, sequence_ctx]. the size of sequence may vary from call to call
......@@ -902,27 +927,40 @@ class HFLM(TemplateLM):
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder
"""
with torch.no_grad():
with (
torch.no_grad(),
torch.autocast(
device_type=self.device.type,
dtype=self.mixed_precision_dtype,
enabled=self.mixed_precision_dtype is not None,
),
):
if attn_mask is not None or labels is not None:
assert attn_mask is not None and labels is not None
assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
assert transformers.AutoModelForSeq2SeqLM == self.AUTO_MODEL_CLASS
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
def _model_generate(self, context, max_length, stop, **generation_kwargs):
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
def _model_generate(
self,
context,
max_length: int,
stop: list[str],
**generation_kwargs: dict[str, Any],
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
# remove temperature, as do_sample=False takes care of this
# and we don't want a warning from HF
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
do_sample = generation_kwargs.get("do_sample")
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
......@@ -934,17 +972,25 @@ class HFLM(TemplateLM):
stopping_criteria = stop_sequences_criteria(
self.tokenizer, stop, context.shape[1], context.shape[0]
)
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
with torch.autocast(
device_type=self.device.type,
dtype=self.mixed_precision_dtype,
enabled=self.mixed_precision_dtype is not None,
):
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
self,
logits: torch.Tensor,
contlen: int | None = None,
inplen: int | None = None,
) -> torch.Tensor:
if self.backend == "causal":
assert contlen and inplen, (
......@@ -964,8 +1010,8 @@ class HFLM(TemplateLM):
return logits
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[float]:
adaptive_batch_size = None
if self.batch_size == "auto":
# using rolling window with maximum context
......@@ -984,7 +1030,7 @@ class HFLM(TemplateLM):
disable=(disable_tqdm or (self.rank != 0)),
)
):
rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
rolling_token_windows: list[tuple[list[int], list[int]]] = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
......@@ -1068,15 +1114,15 @@ class HFLM(TemplateLM):
def _loglikelihood_tokens(
self,
requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
override_bs: int = None,
) -> List[Tuple[float, bool]]:
override_bs: int | None = None,
) -> list[tuple[float, bool]]:
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key for the sorted method"""
def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key for the sorted method."""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -1087,8 +1133,8 @@ class HFLM(TemplateLM):
toks = req[1] + req[2]
return -len(toks), tuple(toks)
def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
"""Defines the key to group and lookup one-token continuations"""
def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key to group and lookup one-token continuations."""
# Use with group_by="contexts" (optional)
# allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
# speeds up some multiple-choice tasks proportionally to the number of choices.
......@@ -1261,7 +1307,7 @@ class HFLM(TemplateLM):
# original args. Otherwise, expands the logits batch dimension and yields each
# batch along with matching continuation tokens and prompt strings.
# logits -> [1, seq, vocab]
for request_str, cont_toks, logits in re_ord.get_cache(
for request_str, cont_toks, logits in re_ord.get_cache( # noqa
req_str=request_str,
cxt_toks=ctx_tokens,
cont_toks=cont_toks,
......@@ -1302,11 +1348,11 @@ class HFLM(TemplateLM):
return re_ord.get_original(res)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
self, requests: list[Instance], disable_tqdm: bool = False
) -> list[str]:
res = []
def _collate(req: Tuple[str, dict]):
def _collate(req: tuple[str, dict]):
"""Defines the key for the sorted method"""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
......@@ -1366,10 +1412,10 @@ class HFLM(TemplateLM):
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
raise TypeError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
if "max_gen_toks" in kwargs:
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
......@@ -1411,15 +1457,30 @@ class HFLM(TemplateLM):
if self.backend == "causal":
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# Handle integer think_end_token: find last occurrence and strip tokens after it
if isinstance(self.think_end_token, int):
think_token_indices = [
i
for i, token in enumerate(cont_toks)
if token == self.think_end_token
]
if think_token_indices:
cont_toks = cont_toks[think_token_indices[-1] + 1 :]
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
s = self.tok_decode(cont_toks)
# Strip leading whitespace if we removed thinking tokens
if isinstance(self.think_end_token, int):
s = s.lstrip()
# Apply post-processing: remove stop sequences and string-based thinking tokens
s = postprocess_generated_text(
generation=s,
stop=until,
think_end_token=self.think_end_token
if isinstance(self.think_end_token, str)
else None,
)
res.append(s)
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
......@@ -1432,17 +1493,16 @@ class HFLM(TemplateLM):
return res
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
"""Method to apply a chat template to a list of chat history between user and model."""
try:
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**self.chat_template_args,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
......@@ -1454,14 +1514,13 @@ class HFLM(TemplateLM):
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**self.chat_template_args,
)
return chat_templated
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
"""Method to get Hugging Face model information for experiment reproducibility."""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
......
......@@ -16,8 +16,8 @@ eval_logger = logging.getLogger(__name__)
class LocalCompletionsAPI(TemplateAPI):
def __init__(
self,
base_url=None,
tokenizer_backend="huggingface",
base_url: str = None,
tokenizer_backend: str = "huggingface",
**kwargs,
):
super().__init__(
......@@ -108,9 +108,9 @@ class LocalCompletionsAPI(TemplateAPI):
class LocalChatCompletion(LocalCompletionsAPI):
def __init__(
self,
base_url=None,
tokenizer_backend=None,
tokenized_requests=False,
base_url: str = None,
tokenizer_backend: str = None,
tokenized_requests: bool = False,
**kwargs,
):
eval_logger.warning(
......@@ -236,6 +236,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
eval_logger.warning(
"o1 models do not support `stop` and only support temperature=1"
)
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
......
......@@ -11,6 +11,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
handle_stop_sequences,
postprocess_generated_text,
)
from lm_eval.utils import (
get_rolling_token_windows,
......@@ -59,6 +60,8 @@ class SGLangLM(TemplateLM):
dp_size: int = 1,
tp_size: int = 1,
prefix_token_id: Optional[int] = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
**kwargs,
):
super().__init__()
......@@ -74,6 +77,7 @@ class SGLangLM(TemplateLM):
"Either context_length or max_model_len may be provided, but not both"
)
# Initialize your sglang model here
self.think_end_token = think_end_token
self._max_length = (
max_model_len if max_model_len is not None else context_length
)
......@@ -263,6 +267,9 @@ class SGLangLM(TemplateLM):
# cache generations
for output, context in zip(cont, context):
generated_text = output.get("text", "")
generated_text = postprocess_generated_text(
generated_text, until, self.think_end_token
)
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
......
......@@ -852,3 +852,32 @@ def truncate_tokens(
right_length = max_length - left_length
return tokens[:left_length] + tokens[-right_length:]
return None
def postprocess_generated_text(
generation: str, stop: Union[list[str], str, None], think_end_token: Optional[str]
) -> str:
"""
Post-processes the generated text by stripping stop sequences and optional thinking markers.
Args:
generation (str): The generated text to be processed.
stop (Optional[list[str]]): Stop sequence(s) to remove. Text is truncated
at the first occurrence of any stop sequence.
think_end_token (Optional[str]): Token marking end of thinking section. If provided,
returns only the text after this token (discarding thinking content).
Returns:
str: The processed generation - text before stop sequences and after thinking sections.
"""
if stop:
stop = [stop] if isinstance(stop, str) else stop
for term in stop:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
generation = generation.split(term)[0]
if think_end_token:
generation = generation.split(think_end_token)[-1].lstrip()
return generation
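A quick illustration of the intended behavior (the strings below are made up):

```python
# Stop sequences truncate first; then text up to and including the marker is dropped.
postprocess_generated_text(
    "let me think...</think> The answer is 42.\n\nQ:",
    stop=["\n\n"],
    think_end_token="</think>",
)
# -> "The answer is 42."
```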
......@@ -22,6 +22,7 @@ from lm_eval.models.utils import (
Collator,
configure_pad_token,
handle_stop_sequences,
postprocess_generated_text,
undistribute,
)
from lm_eval.utils import (
......@@ -130,10 +131,14 @@ class VLLM(TemplateLM):
max_model_len: int = None,
seed: int = 1234,
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
lora_local_path: str = None,
enable_thinking: bool = False,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
chat_template_args: Optional[dict] = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: Optional[str] = None,
max_lora_rank: int = 16,
**kwargs,
):
super().__init__()
......@@ -147,6 +152,8 @@ class VLLM(TemplateLM):
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
kwargs.pop("device", None)
self.think_end_token = think_end_token
self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0"
self._max_length = max_model_len if max_model_len is not None else max_length
self.tensor_parallel_size = int(tensor_parallel_size)
......@@ -166,7 +173,8 @@ class VLLM(TemplateLM):
"swap_space": int(swap_space),
"quantization": quantization,
"seed": int(seed),
"device": str(device),
"enable_lora": True if lora_local_path else False,
"max_lora_rank": int(max_lora_rank),
}
self.model_args.update(kwargs)
self.batch_size = (
......@@ -201,7 +209,10 @@ class VLLM(TemplateLM):
add_bos_token=add_bos_token,
)
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
self.enable_thinking = enable_thinking
self.chat_template_args = chat_template_args or {}
self.enable_thinking = self.chat_template_args.pop(
"enable_thinking", enable_thinking
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
......@@ -309,6 +320,7 @@ class VLLM(TemplateLM):
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
**self.chat_template_args,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
......@@ -321,6 +333,7 @@ class VLLM(TemplateLM):
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
**self.chat_template_args,
)
return chat_templated
......@@ -627,11 +640,11 @@ class VLLM(TemplateLM):
# cache generations
for output, context in zip(cont, context):
generated_text = output.outputs[0].text
generated_text: str = output.outputs[0].text
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
generated_text = generated_text.split(term)[0]
generated_text = postprocess_generated_text(
generated_text, until, self.think_end_token
)
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
......
......@@ -5,168 +5,172 @@
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
| Task Family | Description | Language(s) |
|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [acp_bench](acpbench/README.md) | Tasks evaluating the reasoning ability about Action, Change, and Planning | English |
| [acp_bench_hard](acpbench/README.md) | Tasks evaluating the reasoning ability about Action, Change, and Planning | English |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Human-centric benchmark built from standardized admission and qualification exams (e.g., SAT, LSAT, Gaokao). | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [ArabCulture](arab_culture/README.md) | Benchmark for evaluating models' commonsense cultural knowledge across 13 different Arab countries. | Arabic |
| [AraDICE](aradice/README.md) | A collection of tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [bbq](bbq/README.md) | A question-answering benchmark designed to measure social biases in language models across various demographic categories and contexts. | English |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [c4](c4/README.md) | Tasks based on a colossal, cleaned version of Common Crawl's web crawl corpus to assess models' language modeling capabilities. | English |
| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (translation, summarization, etc.) for Moroccan Darija. | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Emotional intelligence benchmark in which models rate the intensity of emotional responses in dialogue scenarios. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita_LLM](evalita_llm/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Formal logic deduction tasks requiring multi-step deductive reasoning over synthetic proofs. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Graduate-level, Google-proof question answering written by domain experts in biology, physics, and chemistry. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation tasks that measure functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [jsonschema_bench](jsonschema_bench/README.md) | Evaluate the ability of LLMs to generate JSON objects that conform to a given JSON schema, including API, configuration files, and other structured data formats. | JSON |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
| [meddialog](meddialog/README.md) | Medical open-ended QA and Question Entailment stemming from the MedDialog dataset. | English |
| [medtext](medtext/README.md) | Medical open-ended QA from the MedText Clinical Notes dataset. | English |
| [mimic_repsum](mimic_repsum/README.md) | Medical report summarization from the MIMIC-III dataset. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| [mediqa_qa2019](mediqa_qa2019/README.md) | Open-ended healthcare question answering benchmark from the MEDIQA 2019 challenge. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [meqsum](meqsum/README.md) | Healthcare Question Entailment benchmark from the MeqSum dataset. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
| [mts_dialog](mts_dialog/README.md) | Open-ended healthcare QA from the MTS-Dialog dataset. | English |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [noreval](noreval/README.md) | A human-created Norwegian language understanding and generation benchmark. | Norwegian (Bokmål and Nynorsk) |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [olaph](olaph/README.md) | Open-ended medical factuality Question Answering from the OLAPH dataset. | English |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [ruler](ruler/README.md) | RULER is a benchmark for testing how well language models handle long pieces of text. Requires custom arg (see readme) | English |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval, and MATH). | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [simple_cooccurrence_bias](simple_cooccurrence_bias/README.md) | A metric that evaluates language models for biases based on stereotypical word associations and co-occurrences in text. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [truthfulqa-multi](truthfulqa-multi/README.md) | A multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English, Spanish, Catalan, Basque, Galician |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogender](winogender/README.md) | A diagnostic dataset that tests for gender bias in coreference resolution by measuring how models associate pronouns with different occupations. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| Task Family | Description | Language(s) |
|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [acp_bench](acpbench/README.md) | Tasks evaluating reasoning about Action, Change, and Planning. | English |
| [acp_bench_hard](acpbench/README.md) | Harder variant of the tasks evaluating reasoning about Action, Change, and Planning. | English |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [ArabCulture](arab_culture/README.md) | Benchmark for evaluating models' commonsense cultural knowledge across 13 different Arab countries. | Arabic |
| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [bbq](bbq/README.md) | A question-answering benchmark designed to measure social biases in language models across various demographic categories and contexts. | English |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [c4](c4/README.md) | Tasks based on a colossal, cleaned version of Common Crawl's web crawl corpus to assess models' language modeling capabilities. | English |
| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summarization, etc.) for Moroccan Darija. | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [egyhellaswag](egyhellaswag/README.md) | Egyptian Arabic (Masri) version of HellaSwag. | Egyptian Arabic (MT) |
| [egymmlu](egymmlu/README.md) | Multiple-choice QA in Egyptian Arabic. | Egyptian Arabic (MT) |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita_LLM](evalita_llm/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation tasks that measure functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [jsonschema_bench](jsonschema_bench/README.md) | Evaluate the ability of LLMs to generate JSON objects that conform to a given JSON schema, including API, configuration files, and other structured data formats. | JSON |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [libra](libra/README.md) | Evaluates long-context understanding in Russian across four complexity levels | Russian (MT) |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
| [meddialog](meddialog/README.md) | Medical open-ended QA and Question Entailment stemming from the MedDialog dataset. | English |
| [medtext](medtext/README.md) | Medical open-ended QA from the MedText Clinical Notes dataset. | English |
| [mimic_repsum](mimic_repsum/README.md) | Medical report summarization from the MIMIC-III dataset. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| [mediqa_qa2019](mediqa_qa2019/README.md) | Open-ended healthcare question answering benchmark from the MEDIQA 2019 challenge. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [meqsum](meqsum/README.md) | Healthcare Question Entailment benchmark from the MeqSum dataset. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
| [mts_dialog](mts_dialog/README.md) | Open-ended healthcare QA from the MTS-Dialog dataset. | English |
| [multiblimp](multiblimp/README.md) | MultiBLiMP is a (synthetic) multilingual benchmark testing models on linguistic minimal pairs to judge grammatical acceptability | Multiple (101 languages) - Synthetic |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [noreval](noreval/README.md) | A human-created Norwegian language understanding and generation benchmark. | Norwegian (Bokmål and Nynorsk) |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [olaph](olaph/README.md) | Open-ended medical factuality Question Answering from the OLAPH dataset. | English |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [ruler](ruler/README.md) | RULER is a benchmark for testing how well language models handle long pieces of text. Requires custom arg (see readme) | English |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval, and MATH). | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [simple_cooccurrence_bias](simple_cooccurrence_bias/README.md) | A metric that evaluates language models for biases based on stereotypical word associations and co-occurrences in text. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [truthfulqa-multi](truthfulqa-multi/README.md) | A multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English, Spanish, Catalan, Basque, Galician |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogender](winogender/README.md) | A diagnostic dataset that tests for gender bias in coreference resolution by measuring how models associate pronouns with different occupations. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
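
Any task or task-group name from the table above can be passed to the harness by name. Below is a minimal sketch using the Python entry point; the model name, `model_args`, and the exact `simple_evaluate` keyword arguments are illustrative placeholders and may differ slightly between harness versions.

```python
import lm_eval

# Evaluate a small Hugging Face model on two tasks from the table above.
# Model, batch size, and few-shot count are placeholders; swap in any
# task names listed in the table.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["hellaswag", "gsm8k"],
    num_fewshot=0,
    batch_size=8,
)
print(results["results"])  # per-task metric dictionaries
```
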
## Multimodal Tasks
| Task Family | Description | Modality |
......
......@@ -4,9 +4,9 @@ include: _boolq_cot_2shot_yaml
fewshot_config:
sampler: first_n
samples:
- context: 'This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0.'
question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
- context: 'There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0.'
question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
- context: "This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0."
question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0."
question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
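
The change here swaps `re.sub("\s+", " ", text)` for a raw-string pattern. In a plain string literal, `"\s"` is an invalid escape sequence that newer Python versions flag with a warning (and is slated to become an error), so regex patterns should be written as raw strings. A minimal standalone sketch of the cleaned-up helper (the `get_blank_spaces_pattern` step from the original file is omitted here):

```python
import re

def remove_blank_spaces(text: str) -> str:
    # Raw string: the regex engine sees \s+ directly, and Python emits no
    # "invalid escape sequence" warning for the literal.
    return re.sub(r"\s+", " ", text)

print(remove_blank_spaces("a \t b\n\n c"))  # -> "a b c"
```
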