Merge branch 'smolrefact' into tasklist

# Conflicts: # lm_eval/__main__.py # lm_eval/api/group.py # lm_eval/api/task.py # lm_eval/evaluator_utils.py # lm_eval/tasks/__init__.py # lm_eval/utils.py # pyproject.toml

Merge branch 'smolrefact' into tasklist
# Conflicts: # lm_eval/__main__.py # lm_eval/api/group.py # lm_eval/api/task.py # lm_eval/evaluator_utils.py # lm_eval/tasks/__init__.py # lm_eval/utils.py # pyproject.toml
abd17276 · Baber · 00afd536 · 70314843 · abd17276 · abd17276
Commit abd17276 authored Sep 26, 2025 by Baber
20 changed files
--- a/lm_eval/config/__init__.py
+++ b/lm_eval/config/__init__.py
+from .evaluate_config import EvaluatorConfig
+__all__ = [
+    "EvaluatorConfig",
+]
--- a/lm_eval/config/evaluate_config.py
+++ b/lm_eval/config/evaluate_config.py
--- a/lm_eval/config/metric.py
+++ b/lm_eval/config/metric.py
+from __future__ import annotations
+from collections.abc import Callable, Mapping
+from dataclasses import dataclass, field
+from functools import cached_property
+from typing import Any
+@dataclass
+class MetricConfig:
+    """Describes a single metric: its scoring callable, default kwargs and aggregation."""
+    name: str
+    fn: Callable
+    kwargs: Mapping[str, Any] = field(default_factory=dict)
+    aggregation_fn: Callable | None = None
+    # An explicit None makes `_higher_is_better` fall back to the registry lookup.
+    higher_is_better: bool = True
+    # The two flags below are not read inside this class; presumably they are
+    # consumed by callers (TODO confirm).
+    hf_evaluate: bool = False
+    is_elementwise: bool = True
+    @cached_property
+    def metric_name(self) -> str:
+        """Return `name`; computed once and cached on the instance."""
+        return self.name
+    @cached_property
+    def aggregation(self) -> Callable[..., Any] | None:
+        """Return `aggregation_fn`, or `get_aggregation(name)` when it is None."""
+        # Function-scope import, presumably to avoid an import cycle (TODO confirm).
+        from lm_eval.api.registry import get_aggregation
+        explicit = self.aggregation_fn
+        return explicit if explicit is not None else get_aggregation(self.name)
+    @cached_property
+    def _higher_is_better(self) -> bool | None:
+        """Return `higher_is_better`, or `is_higher_better(name)` when it is None."""
+        from lm_eval.api.registry import is_higher_better
+        flag = self.higher_is_better
+        return flag if flag is not None else is_higher_better(self.name)
+    def compute(self, *args, **kwargs) -> Any:
+        """Call `fn` with `args` and the stored kwargs overlaid by call-time kwargs.
+        Raises:
+            ValueError: if `fn` is None.
+        """
+        scorer = self.fn
+        if scorer is None:
+            raise ValueError(f"Metric function for {self.name} is not defined.")
+        # Call-time keyword arguments win over the configured defaults.
+        merged = dict(self.kwargs or {})
+        merged.update(kwargs)
+        return scorer(*args, **merged)
+    def compute_aggregation(self, *args, **kwargs) -> Any:
+        """Call `aggregation_fn` with the given arguments.
+        Only the explicitly configured `aggregation_fn` is used here; the
+        registry fallback of the `aggregation` property is not consulted.
+        Raises:
+            ValueError: if `aggregation_fn` is None.
+        """
+        reducer = self.aggregation_fn
+        if reducer is None:
+            raise ValueError(f"Aggregation function for {self.name} is not defined.")
+        return reducer(*args, **kwargs)
--- a/lm_eval/config/task.py
+++ b/lm_eval/config/task.py
--- a/lm_eval/config/template.py
+++ b/lm_eval/config/template.py
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Callable
+from lm_eval.config.utils import create_mc_choices
+if TYPE_CHECKING:
+    from lm_eval.config.metric import MetricConfig
+@dataclass
+class TemplateConfig(ABC):
+    """Abstract base holding the settings that describe a prompt template.
+    Subclasses must implement `_doc_to_text`; `_doc_to_choice` and
+    `_doc_to_target` raise NotImplementedError unless overridden.
+    """
+    # Identification of the template and the task it belongs to.
+    template: str
+    task: str
+    # Sources for the question text, the answer choices and the gold target.
+    doc_to_text: str | Callable[[dict], str] | list[str]
+    doc_to_choice: str | list | Callable[[dict], list]
+    doc_to_target: int | Callable[[dict], int]
+    description: str
+    # String pieces used when assembling a prompt.
+    context_prefix: str
+    prefix_delimiter: str
+    context_delimiter: str
+    answer_suffix: str
+    target_delimiter: str
+    choice_format: str | None
+    choice_delimiter: str | None
+    fewshot_delimiter: str
+    # A fresh list per instance, so the default is never shared.
+    metric_list: list[str] | list[MetricConfig] | None = field(
+        default_factory=lambda: ["acc", "acc_norm"]
+    )
+    @abstractmethod
+    def _doc_to_text(self, doc: dict) -> str:
+        """Render the prompt text for `doc`; must be provided by subclasses."""
+        raise NotImplementedError
+    def _doc_to_choice(self, doc: dict) -> str:
+        """Render the choices for `doc`; not implemented on the base class."""
+        raise NotImplementedError
+    def _doc_to_target(self, doc: dict) -> int | str:
+        """Resolve the target for `doc`; not implemented on the base class."""
+        raise NotImplementedError
+@dataclass
+class MCQTemplateConfig:
+    """Template configuration for multiple-choice (MCQ) prompts.
+    With the default settings a rendered sample is intended to look like:
+    Question: <doc_to_text(doc)>
+    A. <doc_to_choice(doc)[0]>
+    B. <doc_to_choice(doc)[1]>
+    C. <doc_to_choice(doc)[2]>
+    D. <doc_to_choice(doc)[3]>
+    Answer: <one entry of doc_to_choice(doc), for each choice>
+    The exact choice layout is produced by `create_mc_choices`.
+    """
+    doc_to_text: str | Callable[[dict], str]
+    doc_to_choice: list[str]
+    doc_to_target: int | Callable[[dict], int]
+    # Unannotated on purpose: a plain class attribute, not a dataclass field,
+    # so it is not an `__init__` parameter.
+    template = "mcq"
+    context_prefix: str = "Question:"
+    prefix_delimiter: str = " "
+    context_delimiter: str = "\n"
+    answer_suffix: str = "Answer:"
+    target_delimiter: str = "\n"
+    choice_format: str | None = "letters"
+    choice_delimiter: str = "\n"
+    fewshot_delimiter: str = "\n\n"
+    metric_list: list[str] | list[MetricConfig] | None = field(
+        default_factory=lambda: ["acc"]
+    )
+    def _doc_to_text(self, doc: dict) -> str:
+        """Build the prompt for `doc`: prefix, question, formatted choices, answer suffix.
+        A string `doc_to_text` is used verbatim; a callable is invoked with `doc`.
+        """
+        doc_to_text: str = (
+            self.doc_to_text
+            if isinstance(self.doc_to_text, str)
+            else self.doc_to_text(doc)
+        )
+        return (
+            self.context_prefix
+            + self.prefix_delimiter
+            + doc_to_text
+            + self.context_delimiter
+            # Resolve the choices through `_doc_to_choice` so that a callable or
+            # a column-name `doc_to_choice` is evaluated against `doc` instead of
+            # being handed to `create_mc_choices` unresolved.
+            + self._doc_to_choice(doc)
+            + self.answer_suffix
+        )
+    def _doc_to_choice(self, doc: dict) -> str:
+        """Resolve the choices for `doc` and format them with `create_mc_choices`.
+        A callable `doc_to_choice` is invoked with `doc`, a string is used as a
+        key into `doc`, and any other value is used as the choices directly.
+        """
+        if callable(self.doc_to_choice):
+            doc_to_choice = self.doc_to_choice(doc)
+        elif isinstance(self.doc_to_choice, str):
+            doc_to_choice = doc[self.doc_to_choice]
+        else:
+            doc_to_choice = self.doc_to_choice
+        return create_mc_choices(doc_to_choice, choice_delimiter=self.choice_delimiter)
+    def _doc_to_target(self, doc: dict) -> int:
+        """Resolve the target for `doc`.
+        A callable `doc_to_target` is invoked with `doc`, a string is used as a
+        key into `doc`, and any other value is returned unchanged.
+        """
+        if callable(self.doc_to_target):
+            return self.doc_to_target(doc)
+        elif isinstance(self.doc_to_target, str):
+            return doc[self.doc_to_target]
+        else:
+            return self.doc_to_target
+# kw_only=True (Python 3.10+) is required for this class to be definable at
+# all: the inherited field order places `template` (given a default here)
+# before `task` (no default), which a positional dataclass rejects with
+# "non-default argument 'task' follows default argument". With kw_only, every
+# field declared below becomes keyword-only and `task` stays the sole
+# positional parameter.
+@dataclass(kw_only=True)
+class ClozeTemplateConfig(TemplateConfig):
+    """Template configuration for cloze-style prompts.
+    With the default settings a rendered sample is intended to look like:
+    Question: <doc_to_text(doc)>
+    Answer: <doc_to_target(doc)>
+    """
+    doc_to_text: str | Callable[[dict], str]
+    doc_to_choice: list[str]
+    doc_to_target: int | Callable[[dict], int]
+    template: str = "cloze"
+    description: str = ""
+    context_prefix: str = "Question:"
+    prefix_delimiter: str = " "
+    context_delimiter: str = "\n"
+    answer_suffix: str = "Answer:"
+    target_delimiter: str = " "
+    choice_format: str | None = None
+    choice_delimiter: str = ""
+    fewshot_delimiter: str = "\n\n"
+    metric_list: list[str] | list[MetricConfig] | None = field(
+        default_factory=lambda: ["acc", "acc_norm"]
+    )
+    def _doc_to_text(self, doc: dict) -> str:
+        """Build the prompt for `doc`: prefix, question text, then the answer suffix.
+        A string `doc_to_text` is used verbatim; a callable is invoked with `doc`.
+        """
+        doc_to_text: str = (
+            self.doc_to_text
+            if isinstance(self.doc_to_text, str)
+            else self.doc_to_text(doc)
+        )
+        return (
+            self.context_prefix
+            + self.prefix_delimiter
+            + doc_to_text
+            + self.context_delimiter
+            + self.answer_suffix
+        )
+    def _doc_to_choice(self, doc: dict) -> str:
+        """Resolve the choices for `doc` and format them with `create_mc_choices`.
+        A callable `doc_to_choice` is invoked with `doc`, a string is used as a
+        key into `doc`, and any other value is used as the choices directly.
+        """
+        if callable(self.doc_to_choice):
+            doc_to_choice = self.doc_to_choice(doc)
+        elif isinstance(self.doc_to_choice, str):
+            doc_to_choice = doc[self.doc_to_choice]
+        else:
+            doc_to_choice = self.doc_to_choice
+        return create_mc_choices(doc_to_choice, choice_delimiter=self.choice_delimiter)
+    def _doc_to_target(self, doc: dict) -> int:
+        """Resolve the target for `doc`.
+        A callable `doc_to_target` is invoked with `doc`, a string is used as a
+        key into `doc`, and any other value is returned unchanged.
+        """
+        if callable(self.doc_to_target):
+            return self.doc_to_target(doc)
+        elif isinstance(self.doc_to_target, str):
+            return doc[self.doc_to_target]
+        else:
+            return self.doc_to_target
--- a/lm_eval/config/utils.py
+++ b/lm_eval/config/utils.py
--- a/lm_eval/decontamination/archiver.py
+++ b/lm_eval/decontamination/archiver.py
--- a/lm_eval/decontamination/janitor.py
+++ b/lm_eval/decontamination/janitor.py
@@ -5,8 +5,9 @@ import traceback
 from typing import Iterator, List, Sequence, Tuple, TypeVar
-# This is a cpp module. Compile janitor_util.cpp with:
+# This is a cpp module.
-# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
+# See scripts/clean_training_data/README.md for instructions to compile janitor_util.cpp
 try:
    import janitor_util

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
--- a/lm_eval/filters/__init__.py
+++ b/lm_eval/filters/__init__.py
--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
--- a/lm_eval/filters/selection.py
+++ b/lm_eval/filters/selection.py
@@ -27,7 +27,6 @@ class TakeFirstFilter(Filter):
 class TakeKFilter(Filter):
    def __init__(self, **kwargs) -> None:
        self.k = kwargs.pop("k")
        super().__init__(**kwargs)
    def apply(self, resps, docs):

--- a/lm_eval/filters/transformation.py
+++ b/lm_eval/filters/transformation.py
--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
--- a/lm_eval/models/api_models.py
+++ b/lm_eval/models/api_models.py
--- a/lm_eval/models/hf_steered.py
+++ b/lm_eval/models/hf_steered.py
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
--- a/lm_eval/models/ibm_watsonx_ai.py
+++ b/lm_eval/models/ibm_watsonx_ai.py