Commit e6b798f9 authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
......@@ -34,7 +34,6 @@ repos:
# Run the linter.
- id: ruff-check
args: [ --fix]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
......@@ -42,8 +41,10 @@ repos:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.30
......@@ -51,9 +52,3 @@ repos:
- id: pymarkdown
exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
args: [fix, -r]
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.5.1
# hooks:
# - id: mypy
# additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
# exclude: ^tests/.*$
......@@ -17,7 +17,7 @@ def try_parse_json(value: str) -> Union[str, dict, None]:
if "{" in value:
raise argparse.ArgumentTypeError(
f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
)
) from None
return value
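The hunk above only shows the error branch of `try_parse_json`; a self-contained reconstruction of such a parser (an assumption about the surrounding lines, not the project's exact helper) could look like:

```python
import argparse
import json
from typing import Union


def try_parse_json(value: str) -> Union[str, dict, None]:
    """Return a dict if `value` parses as JSON, otherwise the raw string."""
    if value is None:
        return None
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        if "{" in value:
            raise argparse.ArgumentTypeError(
                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
            ) from None
        return value
```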
......@@ -30,8 +30,8 @@ def _int_or_none_list_arg_type(
return None
try:
return int(item)
except ValueError:
raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
except ValueError as e:
raise argparse.ArgumentTypeError(f"{item} is not an integer or None") from e
items = [parse_value(v) for v in value.split(split_char)]
num_items = len(items)
......@@ -433,8 +433,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets
from packaging.version import parse as vparse
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if vparse(datasets.__version__) < vparse("4.0.0"):
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if isinstance(args.model_args, dict):
args.model_args["trust_remote_code"] = True
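The comment above explains why `datasets.config` has to be patched directly, and the merged branch now gates that patch on the installed `datasets` version, since 4.0 removed the `trust_remote_code` path. A standalone sketch combining both guards touched in this merge (illustrative, with a hypothetical helper name):

```python
# Illustrative sketch, not the committed lines; `adjust_trust_remote_code` is hypothetical.
import datasets
from packaging.version import parse as vparse

DATASETS_V4 = vparse(datasets.__version__) >= vparse("4.0.0")


def adjust_trust_remote_code(dataset_kwargs: dict | None) -> dict | None:
    """datasets >= 4.0 dropped trust_remote_code, so strip it; older versions keep the flag."""
    if DATASETS_V4 and dataset_kwargs:
        dataset_kwargs.pop("trust_remote_code", None)
    elif not DATASETS_V4:
        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
    return dataset_kwargs
```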
......@@ -510,7 +512,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
)
if args.log_samples:
for task_name, config in results["configs"].items():
for task_name, _config in results["configs"].items():
evaluation_tracker.save_results_samples(
task_name=task_name, samples=samples[task_name]
)
......
......@@ -663,6 +663,11 @@ class ConfigurableTask(Task):
print("hello")
def download(self, dataset_kwargs: dict[str, Any] | None = None, **kwargs) -> None:
from packaging.version import parse as vparse
if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
dataset_kwargs.pop("trust_remote_code", None)
self.config.dataset_kwargs, self.config.metadata = (
self.config.dataset_kwargs or {},
self.config.metadata or {},
......
......@@ -7,7 +7,7 @@ import os
import random
import time
from collections import defaultdict
from typing import TYPE_CHECKING, Any, List, Optional, Union
from typing import TYPE_CHECKING, Any
import numpy as np
import torch
......@@ -37,6 +37,7 @@ from lm_eval.utils import (
positional_deprecated,
setup_logging,
simple_parse_args_string,
wrap_text,
)
......@@ -50,28 +51,28 @@ eval_logger = logging.getLogger(__name__)
@positional_deprecated
def simple_evaluate(
model,
model_args: Optional[Union[str, dict[str, Any]]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None,
batch_size: Optional[Union[int, str]] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
model_args: str | dict[str, Any] | None = None,
tasks: list[str | dict | object] | None = None,
num_fewshot: int | None = None,
batch_size: int | str | None = None,
max_batch_size: int | None = None,
device: str | None = None,
use_cache: str | None = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
delete_requests_cache: bool = False,
limit: Optional[Union[int, float]] = None,
samples: Optional[dict] = None,
limit: int | float | None = None,
samples: dict | None = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
write_out: bool = False,
log_samples: bool = True,
evaluation_tracker: Optional[EvaluationTracker] = None,
system_instruction: Optional[str] = None,
apply_chat_template: Union[bool, str] = False,
evaluation_tracker: EvaluationTracker | None = None,
system_instruction: str | None = None,
apply_chat_template: bool | str = False,
fewshot_as_multiturn: bool = False,
gen_kwargs: Union[str, dict, None] = None,
task_manager: Optional[TaskManager] = None,
gen_kwargs: str | dict | None = None,
task_manager: TaskManager | None = None,
verbosity=None,
predict_only: bool = False,
random_seed: int = 0,
......@@ -79,7 +80,7 @@ def simple_evaluate(
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
confirm_run_unsafe_code: bool = False,
metadata: Optional[dict] = None,
metadata: dict | None = None,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -171,8 +172,11 @@ def simple_evaluate(
)
) and not apply_chat_template:
eval_logger.warning(
"Model appears to be an instruct or chat variant but chat template is not applied. "
"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
wrap_text(
f"""pretrained={model_args.get("pretrained") if isinstance(model_args, dict) else model_args} appears to be an
instruct or chat variant but chat template is not applied.
Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).""",
)
)
if delete_requests_cache:
......@@ -236,7 +240,9 @@ def simple_evaluate(
else:
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
wrap_text(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args,
......@@ -283,7 +289,7 @@ def simple_evaluate(
# helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
# (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
def _adjust_config(task_dict: dict[str, "Task"]) -> dict[str, "Task"]:
def _adjust_config(task_dict: dict[str, Task]) -> dict[str, Task]:
adjusted_task_dict = {}
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
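The comment above describes `_adjust_config` as a recursive walk that applies overrides (fewshot count, fewshot seed, metric bypass) only to leaf tasks while passing groups through. A minimal sketch of that recursion pattern (hypothetical names, not the project's implementation):

```python
# Minimal sketch of the recursive override pattern described above
# (hypothetical function; the real code mutates Task objects with richer logic).
def adjust_config(task_dict: dict, overrides: dict) -> dict:
    adjusted = {}
    for name, obj in task_dict.items():
        if isinstance(obj, dict):            # group node: recurse into its subtasks
            adjusted[name] = adjust_config(obj, overrides)
        else:                                # leaf task: apply the overrides here
            for key, value in overrides.items():
                setattr(obj, key, value)
            adjusted[name] = obj
    return adjusted
```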
......@@ -414,17 +420,17 @@ def simple_evaluate(
@positional_deprecated
def evaluate(
lm: "LM",
lm: LM,
task_dict,
limit: int | float | None = None,
samples: Optional[dict] = None,
samples: dict | None = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
bootstrap_iters: Optional[int] = 100000,
bootstrap_iters: int | None = 100000,
write_out: bool = False,
log_samples: bool = True,
system_instruction: Optional[str] = None,
apply_chat_template: Union[bool, str] = False,
system_instruction: str | None = None,
apply_chat_template: bool | str = False,
fewshot_as_multiturn: bool = False,
verbosity: str = "INFO",
confirm_run_unsafe_code: bool = False,
......@@ -484,12 +490,11 @@ def evaluate(
# get lists of group hierarchy and each type of request
eval_tasks = get_task_list(task_dict)
if not log_samples:
if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
if not log_samples and not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {})
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation checks:
# 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
......@@ -504,11 +509,10 @@ def evaluate(
raise ValueError(
f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
if len(incompatible_tasks) > 0 and not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
# end validation check
# Cache the limit arg.
......@@ -531,9 +535,7 @@ def evaluate(
system_instruction=system_instruction,
apply_chat_template=bool(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
chat_template=getattr(lm, "apply_chat_template", None),
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
......@@ -606,7 +608,7 @@ def evaluate(
for instances in instances_by_doc_id.values():
instances.sort(key=lambda x: x.idx)
# iterate over different filters used
for filter_key in task.instances[0].filtered_resps.keys():
for filter_key in task.instances[0].filtered_resps:
indices = (
samples.get(task_output.task_name, None)
if samples is not None
......@@ -619,10 +621,7 @@ def evaluate(
samples=indices,
)
for doc_id, doc in doc_iterator:
if indices:
doc_id_true = indices[doc_id]
else:
doc_id_true = doc_id
doc_id_true = indices[doc_id] if indices else doc_id
requests = instances_by_doc_id[doc_id]
metrics = task.process_results(
doc, [req.filtered_resps[filter_key] for req in requests]
......@@ -720,7 +719,7 @@ def evaluate(
): # subtask list will list "task_name": [] for solo tasks
for task in task_list:
for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys():
if m not in _higher_is_better:
_higher_is_better[m] = h
if (
......
......@@ -3,9 +3,10 @@ from __future__ import annotations
import copy
import logging
import os
from collections.abc import Iterator, Sequence
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING, Any, Literal
import jinja2
import torch
......@@ -19,6 +20,7 @@ from accelerate import (
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from packaging.version import parse as vparse
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
......@@ -26,7 +28,6 @@ from transformers.models.auto.modeling_auto import (
)
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
......@@ -44,13 +45,15 @@ from lm_eval.models.utils import (
if TYPE_CHECKING:
from transformers.quantizers.auto import AutoQuantizationConfig
from lm_eval.api.instance import Instance
eval_logger = logging.getLogger(__name__)
TOKENIZER_INFINITY = 1000000000000000019884624838656
@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
"""
An abstracted Huggingface model class. Enables usage with both models of
"""An abstracted Huggingface model class. Enables usage with both models of
`transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
Supports data-parallel multi-GPU with HF Accelerate.
......@@ -98,6 +101,8 @@ class HFLM(TemplateLM):
# end token for thinking, either the string or int token id.
# splits to get response after this token (if provided).
think_end_token: str | int | None = None,
enable_thinking: bool | None = None,
chat_template_args: dict[str, Any] | None = None,
**kwargs,
) -> None:
super().__init__()
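The new `think_end_token` argument is documented above as the marker after which the real response starts. A hedged sketch of that splitting behavior (assumed, not the committed helper):

```python
# Hedged sketch of the assumed splitting behavior; not the committed helper.
def strip_thinking(text: str, think_end_token: str | None) -> str:
    """Keep only the text after the think-end marker when one is configured."""
    if think_end_token and think_end_token in text:
        return text.split(think_end_token, 1)[-1]
    return text


strip_thinking("<think>plan...</think> final answer", "</think>")  # -> " final answer"
```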
......@@ -237,6 +242,11 @@ class HFLM(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
self.chat_template_args = (
chat_template_args or {} | dict(enable_thinking=enable_thinking)
if enable_thinking is not None
else {}
)
self.add_bos_token = add_bos_token
if "gemma" in getattr(self.config, "model_type", ""):
......@@ -370,13 +380,8 @@ class HFLM(TemplateLM):
}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
if not hasattr(self, "accelerator"):
max_memory_per_gpu_map = {
k: v for k, v in max_memory_all_gpus.items()
}
else:
max_memory_all_gpus.pop("cpu", None)
if hasattr(self, "accelerator"):
# use only 1 / num_processes of the GPUs if we are running under accelerate launch
max_memory_per_gpu_map = {
k: v
......@@ -384,6 +389,9 @@ class HFLM(TemplateLM):
if k % num_local_processes
== (self.accelerator.process_index % num_local_processes)
}
else:
max_memory_per_gpu_map = max_memory_all_gpus
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto" if device_map is None else device_map
eval_logger.info(
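For reference, the accelerate-launch branch above keeps only the GPUs whose index maps to the current process; a toy illustration of that partition (hypothetical values, not the committed lines):

```python
# Toy illustration: each process keeps the devices whose index matches its
# process index modulo the local process count.
max_memory_all_gpus = {0: 80, 1: 80, 2: 80, 3: 80}  # GPU id -> memory budget (toy numbers)
max_memory_all_gpus.pop("cpu", None)                 # CPU entry is never used here
num_local_processes = 2
process_index = 1
max_memory_per_gpu_map = {
    k: v
    for k, v in max_memory_all_gpus.items()
    if k % num_local_processes == (process_index % num_local_processes)
}
# -> {1: 80, 3: 80} for process 1; process 0 would keep {0: 80, 2: 80}
```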
......@@ -427,12 +435,12 @@ class HFLM(TemplateLM):
return self._model
@property
def eot_token_id(self):
def eot_token_id(self) -> int:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
def prefix_token_id(self) -> int:
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
......@@ -441,7 +449,7 @@ class HFLM(TemplateLM):
return self.tokenizer.eos_token_id
@property
def max_length(self):
def max_length(self) -> int:
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
......@@ -449,7 +457,7 @@ class HFLM(TemplateLM):
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
if self.tokenizer.model_max_length == TOKENIZER_INFINITY:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
......@@ -484,8 +492,8 @@ class HFLM(TemplateLM):
backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: bool | None = False,
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
......@@ -504,13 +512,18 @@ class HFLM(TemplateLM):
)
else:
# determine and use the default HF backend for this model, based on its config + metadata.
if self.config.model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
if (
getattr(config, "model_type", None)
in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
):
# first check if model type is listed under seq2seq models, since some
# models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
# these special cases should be treated as seq2seq models.
self.backend = "seq2seq"
eval_logger.debug(f"Using model type '{self.backend}'")
elif self.config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
elif (
getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
):
self.backend = "causal"
eval_logger.debug(f"Using model type '{self.backend}'")
else:
......@@ -541,7 +554,7 @@ class HFLM(TemplateLM):
gguf_file: str | None = None,
subfolder: str = "",
) -> None:
"""Return the model config for HuggingFace models"""
"""Return the model config for HuggingFace models."""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -574,8 +587,7 @@ class HFLM(TemplateLM):
subfolder: str = "",
**kwargs,
) -> None:
"""
Initializes an HF or HF-compatible PreTrainedModel from scratch
"""Initializes an HF or HF-compatible PreTrainedModel from scratch
inside HFLM, using the kwargs passed into self.__init__().
Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
......@@ -586,7 +598,7 @@ class HFLM(TemplateLM):
please consider subclassing HFLM and overriding this and other methods as needed.
"""
model_kwargs = kwargs if kwargs else {}
model_kwargs = kwargs or {}
model_kwargs.update(
self._get_accelerate_args(
......@@ -600,15 +612,12 @@ class HFLM(TemplateLM):
)
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
assert transformers.__version__ >= "4.30.0", (
if model_kwargs.get("load_in_4bit"):
assert vparse(transformers.__version__) >= vparse("4.30.0"), (
"load_in_4bit requires transformers >= 4.30.0"
)
if transformers.__version__ >= "4.30.0" and (
model_kwargs.get("load_in_4bit")
and (compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"))
):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
if compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
......@@ -666,9 +675,9 @@ class HFLM(TemplateLM):
if peft:
from peft import PeftModel, __version__ as PEFT_VERSION
if model_kwargs.get("load_in_4bit") and version.parse(
PEFT_VERSION
) < version.parse("0.4.0"):
if model_kwargs.get("load_in_4bit") and vparse(PEFT_VERSION) < vparse(
"0.4.0"
):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
......@@ -694,10 +703,10 @@ class HFLM(TemplateLM):
for name, param in self._model.state_dict().items():
try:
param.data += _model_delta.state_dict()[name]
except KeyError:
except KeyError as e:
raise KeyError(
f"Delta model is missing weights for layer: {name}"
) from None
) from e
except Exception as e:
raise RuntimeError(
f"Failed to add delta weights to layer {name}. Error: {e}"
......@@ -705,8 +714,6 @@ class HFLM(TemplateLM):
del _model_delta
return None
def _create_tokenizer(
self,
pretrained: str | transformers.PreTrainedModel,
......@@ -721,8 +728,7 @@ class HFLM(TemplateLM):
add_bos_token: bool | None = False,
subfolder: str | None = "",
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Create a tokenizer object corresponding to the correct
tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
......@@ -768,9 +774,8 @@ class HFLM(TemplateLM):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name, **kwargs
)
return None
def _detect_batch_size(self, requests=None, pos: int = 0):
def _detect_batch_size(self, requests: Sequence | None = None, pos: int = 0):
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len(
......@@ -785,7 +790,7 @@ class HFLM(TemplateLM):
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
def forward_batch(batch_size: int):
if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
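For context, `find_executable_batch_size` is the accelerate utility doing the work here: it re-invokes the wrapped function, halving `batch_size` on CUDA OOM until a size fits. A minimal, hedged usage sketch outside the harness:

```python
# Hedged sketch of the auto-batch-size probe; in the harness the inner
# function runs a dummy forward pass, here it simply returns the size.
from accelerate import find_executable_batch_size


def detect_batch_size(max_batch_size: int = 64) -> int:
    @find_executable_batch_size(starting_batch_size=max_batch_size)
    def forward_batch(batch_size: int) -> int:
        return batch_size

    return forward_batch()


print(detect_batch_size())  # 64 on a machine with enough memory
```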
......@@ -832,7 +837,10 @@ class HFLM(TemplateLM):
return batch_size
def tok_encode(
self, string: str, left_truncate_len=None, add_special_tokens=None
self,
string: str,
left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
) -> list[int]:
""" """
# default for None - empty dict, use predefined tokenizer param
......@@ -861,7 +869,7 @@ class HFLM(TemplateLM):
self,
strings: list[str],
padding_side: str = "left",
left_truncate_len: int = None,
left_truncate_len: int | None = None,
truncation: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
......@@ -882,7 +890,7 @@ class HFLM(TemplateLM):
if left_truncate_len:
original_lengths = encoding["input_ids"].size(1)
if original_lengths > left_truncate_len:
eval_logger.warn(
eval_logger.warning(
f"Left truncation applied. Original sequence length was {original_lengths}, "
f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
)
......@@ -894,11 +902,17 @@ class HFLM(TemplateLM):
return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens, skip_special_tokens=True):
def tok_decode(self, tokens: Iterator[list[str]], skip_special_tokens: bool = True):
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_call(self, inps, attn_mask=None, labels=None):
def _model_call(
self,
inps: torch.Tensor,
attn_mask: torch.Tensor | None = None,
labels: torch.Tensor | None = None,
) -> torch.Tensor:
"""
:param inps: torch.Tensor
A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
[batch, sequence_ctx]. the size of sequence may vary from call to call
......@@ -926,14 +940,20 @@ class HFLM(TemplateLM):
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
def _model_generate(self, context, max_length, stop, **generation_kwargs):
def _model_generate(
self,
context,
max_length: int,
stop: list[str],
**generation_kwargs: dict[str, Any],
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
# remove temperature, as do_sample=False takes care of this
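The comments above describe how sampling arguments are normalized before generation; a hedged sketch of that normalization as a standalone function (hypothetical name, not the committed code):

```python
# Hypothetical helper mirroring the comments above; not the committed code.
def normalize_gen_kwargs(generation_kwargs: dict) -> dict:
    generation_kwargs.setdefault("temperature", 0.0)  # temperature = 0.0 if not set
    if (
        generation_kwargs.get("do_sample") is False
        and generation_kwargs["temperature"] == 0.0
    ):
        # do_sample=False already selects greedy decoding, so temperature is redundant.
        generation_kwargs.pop("temperature")
    return generation_kwargs


normalize_gen_kwargs({"do_sample": False})  # -> {"do_sample": False}
```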
......@@ -966,7 +986,10 @@ class HFLM(TemplateLM):
)
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
self,
logits: torch.Tensor,
contlen: int | None = None,
inplen: int | None = None,
) -> torch.Tensor:
if self.backend == "causal":
assert contlen and inplen, (
......@@ -1092,13 +1115,13 @@ class HFLM(TemplateLM):
self,
requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
override_bs: int = None,
override_bs: int | None = None,
) -> list[tuple[float, bool]]:
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key for the sorted method"""
"""Defines the key for the sorted method."""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -1110,7 +1133,7 @@ class HFLM(TemplateLM):
return -len(toks), tuple(toks)
def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key to group and lookup one-token continuations"""
"""Defines the key to group and lookup one-token continuations."""
# Use with group_by="contexts" (optional)"
# allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
# speeds up some multiple-choice tasks proportionally to the number of choices.
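The comments above justify the descending-length sort key: the first element of each batch then bounds its padded length and memory use. A toy illustration with hypothetical requests:

```python
# Toy illustration of the descending-length sort key described above.
requests = [
    (("ctx a", "cont a"), [1, 2, 3], [7]),  # 4 tokens total
    (("ctx b", "cont b"), [4], [8, 9]),     # 3 tokens total
]
requests.sort(key=lambda req: (-len(req[1] + req[2]), tuple(req[1] + req[2])))
# The longest request now comes first, so it sets the batch's padded length.
```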
......@@ -1388,7 +1411,7 @@ class HFLM(TemplateLM):
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
raise TypeError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs:
......@@ -1471,15 +1494,14 @@ class HFLM(TemplateLM):
def apply_chat_template(
self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
"""Method to apply a chat template to a list of chat history between user and model."""
try:
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**self.chat_template_args,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
......@@ -1491,14 +1513,13 @@ class HFLM(TemplateLM):
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**self.chat_template_args,
)
return chat_templated
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
"""Method to get Hugging Face model information for experiment reproducibility."""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
......
......@@ -133,11 +133,11 @@ class VLLM(TemplateLM):
max_model_len: int | None = None,
seed: int = 1234,
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
lora_local_path: str | None = None,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
chat_template_args: dict | None = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: str | None = None,
max_lora_rank: int = 16,
......@@ -154,6 +154,7 @@ class VLLM(TemplateLM):
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
kwargs.pop("device", None)
self.think_end_token = think_end_token
self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0"
self._max_length = max_model_len if max_model_len is not None else max_length
......@@ -174,7 +175,6 @@ class VLLM(TemplateLM):
"swap_space": int(swap_space),
"quantization": quantization,
"seed": int(seed),
"device": str(device),
"enable_lora": bool(lora_local_path),
"max_lora_rank": int(max_lora_rank),
}
......@@ -211,7 +211,10 @@ class VLLM(TemplateLM):
add_bos_token=add_bos_token,
)
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
self.enable_thinking = enable_thinking
self.chat_template_args = chat_template_args or {}
self.enable_thinking = self.chat_template_args.pop(
"enable_thinking", enable_thinking
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
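As merged, a key inside `chat_template_args` overrides the standalone `enable_thinking` flag, because the flag is only the fallback for the `pop`. For example (hypothetical values):

```python
# Hypothetical values showing the precedence of chat_template_args over the flag.
chat_template_args = {"enable_thinking": False, "add_generation_prompt": True}
enable_thinking = True                                   # constructor default
args = chat_template_args or {}
resolved = args.pop("enable_thinking", enable_thinking)  # -> False (dict value wins)
# args now holds the remaining template kwargs: {"add_generation_prompt": True}
```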
......@@ -319,6 +322,7 @@ class VLLM(TemplateLM):
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
**self.chat_template_args,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
......@@ -331,6 +335,7 @@ class VLLM(TemplateLM):
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
**self.chat_template_args,
)
return chat_templated
......
......@@ -85,6 +85,7 @@
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [libra](libra/README.md) | Evaluates long-context understanding in Russian across four complexity levels | Russian (MT) |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
......
......@@ -4,9 +4,9 @@ include: _boolq_cot_2shot_yaml
fewshot_config:
sampler: first_n
samples:
- context: 'This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0.'
question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
- context: 'There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0.'
question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
- context: "This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0."
question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0."
question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -12,9 +12,9 @@ def prompt_func(mode, lang):
"prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
f"Given the categories technology, religion, politics, sports, health, entertainment, or business; what is "
f"the topic of the {lang} statement below? Return only the category. "
"\n\ntext: {{headline}} \category:\n\n",
"\n\ntext: {{headline}} \\category:\n\n",
"prompt_4": "Label the following text as technology, religion, politics, sports, health, entertainment, or geography. Provide only the category as your "
"response. \n\ntext: {{headline}} \category: \n\n",
"response. \n\ntext: {{headline}} \\category: \n\n",
"prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
f"For each input, classify the topic as technology, business, politics, sports, health, entertainment, or religion. "
f"Use the following guidelines: \n\n "
......@@ -27,7 +27,7 @@ def prompt_func(mode, lang):
f"business: The text covers economy, business, or related topics. \n\n"
f"If the text contains multiple topics, choose the dominant topic. "
f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
"Please provide a single classification for each input.\n\ntext: {{headline}} \category: \n\n",
"Please provide a single classification for each input.\n\ntext: {{headline}} \\category: \n\n",
}
return prompt_map[mode]
......
......@@ -17,9 +17,9 @@ def prompt_func(mode, lang):
"prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
f"Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is "
f"the topic of the {lang} statement below? Return only the category. "
"\n\ntext: {{text}} \category:\n\n",
"\n\ntext: {{text}} \\category:\n\n",
"prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your "
"response. \n\ntext: {{text}} \category: \n\n",
"response. \n\ntext: {{text}} \\category: \n\n",
"prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
f"For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. "
f"Use the following guidelines: \n\n "
......@@ -32,7 +32,7 @@ def prompt_func(mode, lang):
f"geography: The text involves geographical information, locations, or related topics. \n\n"
f"If the text contains multiple topics, choose the dominant topic. "
f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
"Please provide a single classification for each input.\n\ntext: {{text}} \category: \n\n",
"Please provide a single classification for each input.\n\ntext: {{text}} \\category: \n\n",
}
return prompt_map[mode]
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -47,7 +47,7 @@ def parse_math_answer(raw_string):
return retval
def get_answer_with_dollar_sign(s):
first_pattern = "\$(.*)\$"
first_pattern = r"\$(.*)\$"
last_match = None
matches = re.findall(first_pattern, s)
if matches:
......@@ -63,7 +63,7 @@ def parse_math_answer(raw_string):
if "\\n" in last_match:
last_match = last_match.split("\\n")[0]
else:
pattern = "(?:\\$)?\d+(?:\.\d+)?(?![\w\d])"
pattern = "(?:\\$)?\\d+(?:\\.\\d+)?(?![\\w\\d])"
matches = re.findall(pattern, s)
if matches:
last_match = matches[-1]
......@@ -186,7 +186,7 @@ def _strip_string(string):
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "")
string = string.replace(r"\%", "")
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
......
......@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true