Commit 42478664 authored by Baber

Merge remote-tracking branch 'origin/smolrefact' into smolrefact

parents 4d3387f6 003e5852
......@@ -8,8 +8,6 @@ on:
branches:
- 'main'
pull_request:
branches:
- 'main'
workflow_dispatch:
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
......
from dataclasses import dataclass, field
from typing import Literal, Optional, Tuple
from typing import Any, Literal, Optional
OutputType = Literal[
......@@ -10,10 +10,10 @@ OutputType = Literal[
@dataclass
class Instance:
request_type: OutputType
doc: dict
doc: dict[str, Any]
arguments: tuple
idx: int
metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
metadata: tuple[Optional[str], Optional[int], Optional[int]] = field(
default_factory=lambda: (None, None, None),
metadata=dict(
description="Metadata tuple containing task name, document ID, and number of repeats."
......
......@@ -213,7 +213,7 @@ def exact_match_hf_evaluate(
ignore_case: bool = False,
ignore_punctuation: bool = False,
ignore_numbers: bool = False,
multi_target: bool = False,
multiple_targets: bool = False,
):
"""
Compute exact match scores between predictions and references.
......@@ -245,8 +245,8 @@ def exact_match_hf_evaluate(
- "exact_match" (float): The mean exact match score or 1.0/0.0 if `multi_target` is True.
"""
predictions, references = list(predictions), list(references)
assert len(predictions) == len(references) if not multi_target else True, (
"predictions and references must have the same length unless `multi_target` is True"
assert len(predictions) == len(references) if not multiple_targets else True, (
"predictions and references must have the same length unless `multiple_targets` is True"
)
if regexes_to_ignore is not None:
......@@ -275,7 +275,7 @@ def exact_match_hf_evaluate(
return {
"exact_match": np.mean(score_list)
if not multi_target
if not multiple_targets
else float(np.any(score_list))
}
......
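With `multiple_targets=True`, the hunk above scores a single prediction against several acceptable references and returns 1.0 if any of them matches, instead of averaging over prediction/reference pairs. A minimal sketch of just that aggregation switch (the string-normalization options are omitted, and `em_aggregate` is an illustrative helper, not part of the library):

```python
import numpy as np

# Illustrative helper (not library code): mirrors the return expression above.
def em_aggregate(score_list, multiple_targets=False):
    # mean over prediction/reference pairs, or "did any reference match"
    return float(np.mean(score_list)) if not multiple_targets else float(np.any(score_list))

# One prediction checked against three acceptable answers; only one matches.
scores = [0, 1, 0]
print(em_aggregate(scores))                          # ~0.333 with pairwise averaging
print(em_aggregate(scores, multiple_targets=True))   # 1.0 once any reference matches
```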
from __future__ import annotations
import logging
import warnings
from collections.abc import Iterable, Sequence
from functools import partial
from typing import TYPE_CHECKING, Any
import datasets
from random import Random
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from random import Random
from collections.abc import Iterable, Sequence
from typing import Any, TypeVar
from lm_eval.api.task import ConfigurableTask, Task
_T = TypeVar("_T")
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
class ContextSampler:
def __init__(
self,
docs: list[dict],
task: Task | ConfigurableTask,
fewshot_indices: Iterable | None = None,
rnd: Random | None = None,
docs: Sequence[dict[str, Any]] | None = None,
*,
rnd: int | None = None,
fewshot_indices: list[int] | None = None,
**kwargs,
) -> None:
self.rnd = rnd
if not self.rnd:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
)
self.task = task
self.config = task._config
self.target_delimiter = self.config.target_delimiter
self.fewshot_delimiter = self.config.fewshot_delimiter
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_text", None) is not None
):
self.doc_to_text = partial(
self.task.doc_to_text,
doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
)
else:
self.doc_to_text = self.task.doc_to_text
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_target", None) is not None
):
self.doc_to_target = partial(
self.task.doc_to_target,
doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
)
else:
self.doc_to_target = self.task.doc_to_target
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_choice", None) is not None
):
self.doc_to_choice = partial(
self.task.doc_to_choice,
doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
)
else:
self.doc_to_choice = self.task.doc_to_choice
self.docs = docs # HF dataset split, provided by task._fewshot_docs()
if fewshot_indices: # subset few-shot docs from
if not isinstance(self.docs, datasets.Dataset):
raise ValueError(
"Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
)
self.docs = self.docs.select(fewshot_indices)
def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str | None = None):
# draw an extra fewshot sample if using same split as evaluating on
prefix = gen_prefix + " " if gen_prefix else ""
n_samples = (
num_fewshot + 1
if self.config.fewshot_split == self.config.test_split
else num_fewshot
)
self.rnd = Random(rnd)
self.docs = docs or []
self.fewshot_indices = fewshot_indices
# draw `n_samples` docs from fewshot_docs
fewshotex = self.sample(n_samples)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
# TODO: should we just stop people from using fewshot from same split as evaluating?
selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = ""
for doc in selected_docs:
doc_content = self.doc_to_text(doc)
doc_target = self.doc_to_target(doc)
if (
self.config.doc_to_choice is None and isinstance(doc_content, str)
) or isinstance(doc_content, str):
labeled_examples += doc_content
else:
if isinstance(doc_content, int):
labeled_examples += self.doc_to_choice(doc)[doc_content]
if doc_target != "":
if self.target_delimiter.isspace() and str(doc_target)[0].isspace():
# TODO: add logger warn once here.
warnings.warn(
"Both target_delimiter and target start with a space. This may cause issues.",
Warning,
stacklevel=2,
)
labeled_examples += self.target_delimiter
labeled_examples += prefix
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
else doc_target
if self.config.doc_to_choice is None or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter
return labeled_examples
def get_chat_context(
self,
doc: dict,
num_fewshot: int,
fewshot_as_multiturn: bool = False,
gen_prefix: str | None = None,
):
# TODO: Do we need any other delimiter
prefix = gen_prefix + " " if gen_prefix else ""
chat_history = []
# draw an extra fewshot sample if using same split as evaluating on
n_samples = (
num_fewshot + 1
if self.config.fewshot_split == self.config.test_split
else num_fewshot
)
# draw `n_samples` docs from fewshot_docs
fewshotex = self.sample(n_samples)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
# TODO: should we just stop people from using fewshot from same split as evaluating?
selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
if fewshot_as_multiturn:
for doc in selected_docs:
doc_content = self.doc_to_text(doc)
doc_target = self.doc_to_target(doc)
chat_history.append(
{
"role": "user",
"content": doc_content
if self.config.doc_to_choice is None
or isinstance(doc_content, str)
else self.doc_to_choice(doc)[doc_content],
}
)
chat_history.append(
{
"role": "assistant",
"content": prefix + str(doc_target[0])
if isinstance(doc_target, list)
else prefix + doc_target
if self.config.doc_to_choice is None
or isinstance(doc_target, str)
else prefix + str(self.doc_to_choice(doc)[doc_target]),
}
)
else:
# get fewshot context as one user turn
chat_history.append(
{
"role": "user",
"content": self.get_context(
doc, num_fewshot, gen_prefix=gen_prefix
),
}
)
return chat_history
# @classmethod
# def from_fewshot_dfg(cls, cfg: FewshotConfig):
# if not
def sample(self, n: int) -> Sequence[dict]:
if self.fewshot_indices and self.docs:
self.docs = [self.docs[i] for i in self.fewshot_indices]
def sample(
self, n: int, doc: dict[str, Any] | None = None, **kwargs
) -> Sequence[dict]:
"""
Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
Sample n documents from the pool.
Args:
n: Number of documents to sample
doc: Optional document to exclude from sampling
Returns:
List of sampled documents
"""
assert self.rnd is not None, (
"Error: `rnd` must be set to a random.Random instance before sampling."
if n <= 0:
return []
return (
self.rnd.sample(self.docs, n)
if not doc
else self.remove_doc(doc, self.rnd.sample(self.docs, n + 1))
)
return self.rnd.sample(self.docs, n)
def set_rnd(self, rnd: int) -> None:
self.rnd = Random(rnd)
@staticmethod
def remove_doc(doc: _T, _iter: Iterable[_T]) -> list[_T]:
return [x for x in _iter if x != doc]
class FirstNSampler(ContextSampler):
def sample(self, n: int) -> Sequence[dict[str, Any]]:
def sample(self, n: int, doc=None, **kwargs):
"""
Draw the first `n` samples in order from the specified split.
Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
......@@ -214,7 +72,7 @@ class FirstNSampler(ContextSampler):
class BalancedSampler(ContextSampler):
def sample(self, n: int):
def sample(self, n: int, doc=None, **kwargs):
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
TODO: what order should they be in? maybe random?
......@@ -224,7 +82,7 @@ class BalancedSampler(ContextSampler):
class ManualSampler(ContextSampler):
def sample(self, n: int):
def sample(self, n: int, doc=None, **kwargs):
""" """
raise NotImplementedError
......
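The reworked `sample` above draws `n + 1` documents when the evaluated doc must be excluded and then filters it out via `remove_doc`; `get_context` additionally trims the result to `num_fewshot`. A rough standalone sketch of that combined behaviour (`sample_fewshot` is an invented stand-in for illustration, not the `ContextSampler` API):

```python
from random import Random

# Invented stand-in: mirrors sample() plus the [:num_fewshot] trim done in
# get_context, not the exact ContextSampler implementation.
def sample_fewshot(docs, n, rnd, exclude=None):
    if n <= 0:
        return []
    if exclude is None:
        return rnd.sample(docs, n)
    drawn = rnd.sample(docs, n + 1)          # draw one extra in case `exclude` is hit
    return [d for d in drawn if d != exclude][:n]

docs = [{"q": f"question {i}"} for i in range(10)]
rnd = Random(1234)                           # set_rnd wraps an int seed in random.Random
print(sample_fewshot(docs, 3, rnd, exclude=docs[0]))
```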
This diff is collapsed.
......@@ -8,8 +8,8 @@ Homepage: [google-research-datasets/natural-questions@master/nq_open](https://gi
Paper: [aclanthology.org/P19-1612](https://aclanthology.org/P19-1612/)
Derived from the Natural Questions dataset, introduced in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf .
Derived from the Natural Questions dataset, introduced
in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf .
### Citation
......@@ -26,4 +26,5 @@ journal = {Transactions of the Association of Computational Linguistics}}
* `nq_open`
### Changelog
* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
* 2025-07-21: Added `multiple_targets` to `exact_match`. Scores should not change.
......@@ -5,7 +5,7 @@ training_split: train
validation_split: validation
description: "Answer these questions:\n\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}"
doc_to_target: answer
fewshot_delimiter: "\n"
generation_kwargs:
until:
......@@ -27,7 +27,7 @@ metric_list:
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- "\\b(?:The |the |An |A |The |a |an )"
multi_target: true
- "\\b(?:The |the |An |A |The |a |an )"
multiple_targets: true
metadata:
version: 4.0
......@@ -79,3 +79,6 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog
- 2025-07-22: `record` and `multirc`: set target_delimiter to "" and trim doc_to_text respectively.
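Background for the `target_delimiter: ""` change on `record`: the harness joins the rendered `doc_to_text` output and the candidate continuation with `target_delimiter` in between, so an empty delimiter avoids a doubled space when the prompt or continuation already supplies one. The values below are invented purely to illustrate the concatenation order:

```python
# Invented example values; only the concatenation order reflects the harness.
prompt = "Question: Is the sky blue?\nAnswer:"   # hypothetical doc_to_text output
continuation = " yes"                            # hypothetical target with its own leading space
print(repr(prompt + "" + continuation))          # target_delimiter = "" -> '...Answer: yes'
print(repr(prompt + " " + continuation))         # default " " would yield '...Answer:  yes'
```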
tag:
- super-glue-lm-eval-v1
task: boolq
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
......
tag:
- super-glue-lm-eval-v1-seq2seq
task: "boolq-seq2seq"
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: boolq
output_type: generate_until
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: label
doc_to_choice: [' no', ' yes']
doc_to_target: "{{ [' no', ' yes'][label|int] }}"
target_delimiter: ""
generation_kwargs:
until:
......
tag:
- super-glue-t5-prompt
task: super_glue-boolq-t5-prompt
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: boolq
training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
doc_to_target: "{{['False', 'True'][label|int]}}"
generation_kwargs:
until:
- "</s>"
......
tag:
- super-glue-lm-eval-v1
task: cb
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: cb
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
doc_to_target: label
doc_to_choice: ['True', 'False', 'Neither']
doc_to_choice: ["True", "False", "Neither"]
metric_list:
- metric: acc
- metric: f1
......
tag:
- super-glue-t5-prompt
task: super_glue-cb-t5-prompt
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: cb
training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
doc_to_target: "{{ ['entailment', 'contradiction', 'neutral'][label|int] }}"
generation_kwargs:
until:
- "</s>"
......
tag:
- super-glue-lm-eval-v1
task: copa
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: copa
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
doc_to_target: label
doc_to_choice: ["{{ choice1 }}", "{{ choice2 }}"]
metric_list:
- metric: acc
metadata:
......
tag:
- super-glue-t5-prompt
task: super_glue-copa-t5-prompt
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: copa
training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['choice1', 'choice2']
doc_to_target: "{{ [choice1, choice2][label|int] }}"
generation_kwargs:
until:
- "</s>"
......
tag:
- super-glue-lm-eval-v1
task: multirc
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: multirc
output_type: multiple_choice
training_split: train
......
tag:
- super-glue-t5-prompt
task: super_glue-multirc-t5-prompt
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: multirc
training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}|trim"
doc_to_target: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True'][label]}}"
generation_kwargs:
until:
- "</s>"
......
tag:
- super-glue-lm-eval-v1
task: record
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: record
output_type: multiple_choice
training_split: train
......@@ -11,6 +11,7 @@ doc_to_target: !function util.doc_to_target
doc_to_choice: !function util.doc_to_choice
process_docs: !function util.process_docs
process_results: !function util.process_results
target_delimiter: ""
metric_list:
- metric: f1
aggregation: mean
......
tag:
- super-glue-t5-prompt
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_path: aps/super_glue
dataset_name: record
validation_split: validation
output_type: generate_until
......
......@@ -19,7 +19,7 @@ def format_answer(query, entity):
def doc_to_target(doc):
# We only output the first correct entity in a doc
return format_answer(query=doc["query"], entity=doc["answers"][0])
return doc["entities"].index(doc["answers"][0])
def doc_to_choice(doc):
......