Commit 42478664 authored by Baber

Merge remote-tracking branch 'origin/smolrefact' into smolrefact

parents 4d3387f6 003e5852
@@ -8,8 +8,6 @@ on:
     branches:
       - 'main'
   pull_request:
-    branches:
-      - 'main'
   workflow_dispatch:
 # Jobs run concurrently and steps run sequentially within a job.
 # jobs: linter and cpu_tests. Add more jobs/steps as required.
...
 from dataclasses import dataclass, field
-from typing import Literal, Optional, Tuple
+from typing import Any, Literal, Optional

 OutputType = Literal[
@@ -10,10 +10,10 @@ OutputType = Literal[
 @dataclass
 class Instance:
     request_type: OutputType
-    doc: dict
+    doc: dict[str, Any]
     arguments: tuple
     idx: int
-    metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
+    metadata: tuple[Optional[str], Optional[int], Optional[int]] = field(
         default_factory=lambda: (None, None, None),
         metadata=dict(
             description="Metadata tuple containing task name, document ID, and number of repeats."
...
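For illustration, a minimal sketch of constructing an `Instance` under the updated annotations; the field values and the `lm_eval.api.instance` import path are assumptions for this example, not part of the diff.

```python
from lm_eval.api.instance import Instance  # assumed module path

# Hypothetical example: `doc` is now typed dict[str, Any] and `metadata`
# defaults to (None, None, None) via the default_factory.
inst = Instance(
    request_type="loglikelihood",
    doc={"question": "Is the sky blue?", "label": 1},
    arguments=("Is the sky blue?", " yes"),
    idx=0,
)
print(inst.metadata)  # -> (None, None, None)
```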
@@ -213,7 +213,7 @@ def exact_match_hf_evaluate(
     ignore_case: bool = False,
     ignore_punctuation: bool = False,
     ignore_numbers: bool = False,
-    multi_target: bool = False,
+    multiple_targets: bool = False,
 ):
     """
     Compute exact match scores between predictions and references.
@@ -245,8 +245,8 @@ def exact_match_hf_evaluate(
         - "exact_match" (float): The mean exact match score or 1.0/0.0 if `multi_target` is True.
     """
     predictions, references = list(predictions), list(references)
-    assert len(predictions) == len(references) if not multi_target else True, (
-        "predictions and references must have the same length unless `multi_target` is True"
+    assert len(predictions) == len(references) if not multiple_targets else True, (
+        "predictions and references must have the same length unless `multiple_targets` is True"
     )
     if regexes_to_ignore is not None:
@@ -275,7 +275,7 @@ def exact_match_hf_evaluate(
     return {
         "exact_match": np.mean(score_list)
-        if not multi_target
+        if not multiple_targets
         else float(np.any(score_list))
     }
...
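A minimal sketch (not the library's exact code path) of what the renamed `multiple_targets` flag means: one prediction is scored against every reference, and the result collapses to 1.0 if any reference matches exactly, 0.0 otherwise.

```python
import numpy as np

def exact_match_any(prediction: str, references: list[str]) -> float:
    # Mirrors the `float(np.any(score_list))` branch above:
    # one prediction, several acceptable gold answers.
    score_list = [float(prediction == ref) for ref in references]
    return float(np.any(score_list))

print(exact_match_any("Paris", ["Paris", "City of Paris"]))  # 1.0
print(exact_match_any("Lyon", ["Paris", "City of Paris"]))   # 0.0
```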
 from __future__ import annotations

 import logging
-import warnings
-from collections.abc import Iterable, Sequence
-from functools import partial
-from typing import TYPE_CHECKING, Any
-
-import datasets
+from random import Random
+from typing import TYPE_CHECKING

 if TYPE_CHECKING:
-    from random import Random
-
-    from lm_eval.api.task import ConfigurableTask, Task
+    from collections.abc import Iterable, Sequence
+    from typing import Any, TypeVar
+
+    _T = TypeVar("_T")

-eval_logger = logging.getLogger("lm-eval")
+eval_logger = logging.getLogger(__name__)


 class ContextSampler:
     def __init__(
         self,
-        docs: list[dict],
-        task: Task | ConfigurableTask,
-        fewshot_indices: Iterable | None = None,
-        rnd: Random | None = None,
+        docs: Sequence[dict[str, Any]] | None = None,
+        *,
+        rnd: int | None = None,
+        fewshot_indices: list[int] | None = None,
+        **kwargs,
     ) -> None:
-        self.rnd = rnd
-        if not self.rnd:
-            raise ValueError(
-                "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
-            )
-
-        self.task = task
-        self.config = task._config
-
-        self.target_delimiter = self.config.target_delimiter
-        self.fewshot_delimiter = self.config.fewshot_delimiter
-
-        if (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("doc_to_text", None) is not None
-        ):
-            self.doc_to_text = partial(
-                self.task.doc_to_text,
-                doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
-            )
-        else:
-            self.doc_to_text = self.task.doc_to_text
-
-        if (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("doc_to_target", None) is not None
-        ):
-            self.doc_to_target = partial(
-                self.task.doc_to_target,
-                doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
-            )
-        else:
-            self.doc_to_target = self.task.doc_to_target
-
-        if (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("doc_to_choice", None) is not None
-        ):
-            self.doc_to_choice = partial(
-                self.task.doc_to_choice,
-                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
-            )
-        else:
-            self.doc_to_choice = self.task.doc_to_choice
-
-        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
-        if fewshot_indices:  # subset few-shot docs from
-            if not isinstance(self.docs, datasets.Dataset):
-                raise ValueError(
-                    "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
-                )
-            self.docs = self.docs.select(fewshot_indices)
-
-    def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str | None = None):
-        # draw an extra fewshot sample if using same split as evaluating on
-        prefix = gen_prefix + " " if gen_prefix else ""
-        n_samples = (
-            num_fewshot + 1
-            if self.config.fewshot_split == self.config.test_split
-            else num_fewshot
-        )
-        # draw `n_samples` docs from fewshot_docs
-        fewshotex = self.sample(n_samples)
-
-        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
-        # TODO: should we just stop people from using fewshot from same split as evaluating?
-        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
-
-        labeled_examples = ""
-        for doc in selected_docs:
-            doc_content = self.doc_to_text(doc)
-            doc_target = self.doc_to_target(doc)
-            if (
-                self.config.doc_to_choice is None and isinstance(doc_content, str)
-            ) or isinstance(doc_content, str):
-                labeled_examples += doc_content
-            else:
-                if isinstance(doc_content, int):
-                    labeled_examples += self.doc_to_choice(doc)[doc_content]
-            if doc_target != "":
-                if self.target_delimiter.isspace() and str(doc_target)[0].isspace():
-                    # TODO: add logger warn once here.
-                    warnings.warn(
-                        "Both target_delimiter and target start with a space. This may cause issues.",
-                        Warning,
-                        stacklevel=2,
-                    )
-                labeled_examples += self.target_delimiter
-                labeled_examples += prefix
-                labeled_examples += (
-                    str(doc_target[0])
-                    if isinstance(doc_target, list)
-                    else doc_target
-                    if self.config.doc_to_choice is None or isinstance(doc_target, str)
-                    else str(self.doc_to_choice(doc)[doc_target])
-                )
-            labeled_examples += self.fewshot_delimiter
-
-        return labeled_examples
-
-    def get_chat_context(
-        self,
-        doc: dict,
-        num_fewshot: int,
-        fewshot_as_multiturn: bool = False,
-        gen_prefix: str | None = None,
-    ):
-        # TODO: Do we need any other delimiter
-        prefix = gen_prefix + " " if gen_prefix else ""
-        chat_history = []
-        # draw an extra fewshot sample if using same split as evaluating on
-        n_samples = (
-            num_fewshot + 1
-            if self.config.fewshot_split == self.config.test_split
-            else num_fewshot
-        )
-        # draw `n_samples` docs from fewshot_docs
-        fewshotex = self.sample(n_samples)
-
-        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
-        # TODO: should we just stop people from using fewshot from same split as evaluating?
-        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
-
-        if fewshot_as_multiturn:
-            for doc in selected_docs:
-                doc_content = self.doc_to_text(doc)
-                doc_target = self.doc_to_target(doc)
-                chat_history.append(
-                    {
-                        "role": "user",
-                        "content": doc_content
-                        if self.config.doc_to_choice is None
-                        or isinstance(doc_content, str)
-                        else self.doc_to_choice(doc)[doc_content],
-                    }
-                )
-                chat_history.append(
-                    {
-                        "role": "assistant",
-                        "content": prefix + str(doc_target[0])
-                        if isinstance(doc_target, list)
-                        else prefix + doc_target
-                        if self.config.doc_to_choice is None
-                        or isinstance(doc_target, str)
-                        else prefix + str(self.doc_to_choice(doc)[doc_target]),
-                    }
-                )
-        else:
-            # get fewshot context as one user turn
-            chat_history.append(
-                {
-                    "role": "user",
-                    "content": self.get_context(
-                        doc, num_fewshot, gen_prefix=gen_prefix
-                    ),
-                }
-            )
-
-        return chat_history
-
-    # @classmethod
-    # def from_fewshot_dfg(cls, cfg: FewshotConfig):
-    #     if not
-
-    def sample(self, n: int) -> Sequence[dict]:
+        self.rnd = Random(rnd)
+        self.docs = docs or []
+        self.fewshot_indices = fewshot_indices
+        if self.fewshot_indices and self.docs:
+            self.docs = [self.docs[i] for i in self.fewshot_indices]
+
+    def sample(
+        self, n: int, doc: dict[str, Any] | None = None, **kwargs
+    ) -> Sequence[dict]:
         """
-        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
+        Sample n documents from the pool.
+
+        Args:
+            n: Number of documents to sample
+            doc: Optional document to exclude from sampling
+
+        Returns:
+            List of sampled documents
         """
-        assert self.rnd is not None, (
-            "Error: `rnd` must be set to a random.Random instance before sampling."
-        )
-        return self.rnd.sample(self.docs, n)
+        if n <= 0:
+            return []
+        return (
+            self.rnd.sample(self.docs, n)
+            if not doc
+            else self.remove_doc(doc, self.rnd.sample(self.docs, n + 1))
+        )
+
+    def set_rnd(self, rnd: int) -> None:
+        self.rnd = Random(rnd)
+
+    @staticmethod
+    def remove_doc(doc: _T, _iter: Iterable[_T]) -> list[_T]:
+        return [x for x in _iter if x != doc]


 class FirstNSampler(ContextSampler):
-    def sample(self, n: int) -> Sequence[dict[str, Any]]:
+    def sample(self, n: int, doc=None, **kwargs):
         """
         Draw the first `n` samples in order from the specified split.
         Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
@@ -214,7 +72,7 @@ class FirstNSampler(ContextSampler):

 class BalancedSampler(ContextSampler):
-    def sample(self, n: int):
+    def sample(self, n: int, doc=None, **kwargs):
         """
         TODO: this should return approximately class-balanced samples from our fewshot examples.
         TODO: what order should they be in? maybe random?
@@ -224,7 +82,7 @@ class BalancedSampler(ContextSampler):

 class ManualSampler(ContextSampler):
-    def sample(self, n: int):
+    def sample(self, n: int, doc=None, **kwargs):
         """ """
         raise NotImplementedError
...
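Assuming the refactored signature above, a usage sketch of the new `ContextSampler`; the document contents and the `lm_eval.api.samplers` import path are illustrative assumptions.

```python
from lm_eval.api.samplers import ContextSampler  # assumed module path

docs = [{"question": f"q{i}", "answer": f"a{i}"} for i in range(10)]

# `rnd` is now an int seed (wrapped in random.Random internally), and
# `fewshot_indices` pre-selects the pool by position.
sampler = ContextSampler(docs, rnd=1234, fewshot_indices=[0, 2, 4, 6, 8])

# Passing `doc` draws one extra example and filters out any copy of the
# document being evaluated before it can appear as a few-shot example.
fewshot = sampler.sample(3, doc=docs[2])
print([d["question"] for d in fewshot])
```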
@@ -8,8 +8,8 @@ Homepage: [google-research-datasets/natural-questions@master/nq_open](https://github.com/google-research-datasets/natural-questions/tree/master/nq_open)
 Paper: [aclanthology.org/P19-1612](https://aclanthology.org/P19-1612/)

-Derived from the Natural Questions dataset, introduced in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf .
+Derived from the Natural Questions dataset, introduced
+in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf .

 ### Citation
@@ -26,4 +26,5 @@ journal = {Transactions of the Association of Computational Linguistics}}
 * `nq_open`

 ### Changelog
-* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
+* 2025-07-21: Added `multiple_targets` to `exact_match`. Scores should not change.
@@ -5,7 +5,7 @@ training_split: train
 validation_split: validation
 description: "Answer these questions:\n\n"
 doc_to_text: "Q: {{question}}?\nA:"
-doc_to_target: "{{answer}}"
+doc_to_target: answer
 fewshot_delimiter: "\n"
 generation_kwargs:
   until:
@@ -27,7 +27,7 @@ metric_list:
     ignore_case: true
     ignore_punctuation: true
    regexes_to_ignore:
      - "\\b(?:The |the |An |A |The |a |an )"
-    multi_target: true
+    multiple_targets: true
 metadata:
   version: 4.0
@@ -79,3 +79,6 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+- 2025-07-22: `record` and `multirc`: set target_delimiter to "" and trim doc_to_text respectively.
 tag:
   - super-glue-lm-eval-v1
 task: boolq
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: boolq
 output_type: multiple_choice
 training_split: train
...
 tag:
   - super-glue-lm-eval-v1-seq2seq
 task: "boolq-seq2seq"
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: boolq
 output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
-doc_to_target: label
-doc_to_choice: [' no', ' yes']
+doc_to_target: "{{ [' no', ' yes'][label|int] }}"
 target_delimiter: ""
 generation_kwargs:
   until:
...
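The seq2seq and t5-prompt configs in this commit fold the former `doc_to_choice` list into an index-into-list Jinja expression in `doc_to_target`. A standalone sketch of how such a template renders (using jinja2 directly, outside the harness):

```python
from jinja2 import Template

# The boolq-seq2seq target template: pick ' no' or ' yes' by the integer label.
template = Template("{{ [' no', ' yes'][label|int] }}")
print(repr(template.render(label=0)))  # ' no'
print(repr(template.render(label=1)))  # ' yes'
```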
 tag:
   - super-glue-t5-prompt
 task: super_glue-boolq-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: boolq
 training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: "boolq passage: {{passage}} question: {{question}}"
-doc_to_target: label
-doc_to_choice: ['False', 'True']
+doc_to_target: "{{['False', 'True'][label|int]}}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: cb
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: cb
 output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
 doc_to_target: label
-doc_to_choice: ['True', 'False', 'Neither']
+doc_to_choice: ["True", "False", "Neither"]
 metric_list:
   - metric: acc
   - metric: f1
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-cb-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: cb
 training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
-doc_to_target: label
-doc_to_choice: ['entailment', 'contradiction', 'neutral']
+doc_to_target: "{{ ['entailment', 'contradiction', 'neutral'][label|int] }}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: copa
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: copa
 output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
-doc_to_target: !function utils.doc_to_target
-doc_to_choice: !function utils.doc_to_choice
+doc_to_target: label
+doc_to_choice: ["{{ choice1 }}", "{{ choice2 }}"]
 metric_list:
   - metric: acc
 metadata:
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-copa-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: copa
 training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
-doc_to_target: label
-doc_to_choice: ['choice1', 'choice2']
+doc_to_target: "{{ [choice1, choice2][label|int] }}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: multirc
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: multirc
 output_type: multiple_choice
 training_split: train
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-multirc-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: multirc
 training_split: train
 validation_split: validation
 output_type: generate_until
-doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
-doc_to_target: label
-doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
+doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}|trim"
+doc_to_target: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True'][label]}}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: record
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: record
 output_type: multiple_choice
 training_split: train
@@ -11,6 +11,7 @@ doc_to_target: !function util.doc_to_target
 doc_to_choice: !function util.doc_to_choice
 process_docs: !function util.process_docs
 process_results: !function util.process_results
+target_delimiter: ""
 metric_list:
   - metric: f1
     aggregation: mean
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-record-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: record
 validation_split: validation
 output_type: generate_until
...
@@ -19,7 +19,7 @@ def format_answer(query, entity):
 def doc_to_target(doc):
     # We only output the first correct entity in a doc
-    return format_answer(query=doc["query"], entity=doc["answers"][0])
+    return doc["entities"].index(doc["answers"][0])


 def doc_to_choice(doc):
...
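A sketch of why the `record` change above fits the `multiple_choice` setup: `doc_to_target` now returns the position of the first gold answer inside the entity list, i.e. an index into whatever candidates `doc_to_choice` produces. The example document is invented.

```python
doc = {
    "query": "The capital of France is @placeholder.",
    "entities": ["London", "Paris", "Berlin"],
    "answers": ["Paris"],
}

def doc_to_target(doc):
    # We only output the first correct entity in a doc
    return doc["entities"].index(doc["answers"][0])

print(doc_to_target(doc))  # 1 -> the index of "Paris" in doc["entities"]
```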