Commit 42478664 authored by Baber

Merge remote-tracking branch 'origin/smolrefact' into smolrefact

parents 4d3387f6 003e5852
@@ -8,8 +8,6 @@ on:
     branches:
       - 'main'
   pull_request:
-    branches:
-      - 'main'
   workflow_dispatch:
 # Jobs run concurrently and steps run sequentially within a job.
 # jobs: linter and cpu_tests. Add more jobs/steps as required.
...
 from dataclasses import dataclass, field
-from typing import Literal, Optional, Tuple
+from typing import Any, Literal, Optional

 OutputType = Literal[
@@ -10,10 +10,10 @@ OutputType = Literal[
 @dataclass
 class Instance:
     request_type: OutputType
-    doc: dict
+    doc: dict[str, Any]
     arguments: tuple
     idx: int
-    metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
+    metadata: tuple[Optional[str], Optional[int], Optional[int]] = field(
         default_factory=lambda: (None, None, None),
         metadata=dict(
             description="Metadata tuple containing task name, document ID, and number of repeats."
...
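For illustration, a minimal sketch of constructing an `Instance` under the updated annotations; the field values and the `lm_eval.api.instance` import path are assumptions for this example, not part of the diff.

```python
from lm_eval.api.instance import Instance  # assumed module path

# Hypothetical example: `doc` is now typed dict[str, Any] and `metadata`
# defaults to (None, None, None) via the default_factory.
inst = Instance(
    request_type="loglikelihood",
    doc={"question": "Is the sky blue?", "label": 1},
    arguments=("Is the sky blue?", " yes"),
    idx=0,
)
print(inst.metadata)  # -> (None, None, None)
```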
@@ -213,7 +213,7 @@ def exact_match_hf_evaluate(
     ignore_case: bool = False,
     ignore_punctuation: bool = False,
     ignore_numbers: bool = False,
-    multi_target: bool = False,
+    multiple_targets: bool = False,
 ):
     """
     Compute exact match scores between predictions and references.
@@ -245,8 +245,8 @@ def exact_match_hf_evaluate(
         - "exact_match" (float): The mean exact match score or 1.0/0.0 if `multi_target` is True.
     """
     predictions, references = list(predictions), list(references)
-    assert len(predictions) == len(references) if not multi_target else True, (
-        "predictions and references must have the same length unless `multi_target` is True"
+    assert len(predictions) == len(references) if not multiple_targets else True, (
+        "predictions and references must have the same length unless `multiple_targets` is True"
     )
     if regexes_to_ignore is not None:
@@ -275,7 +275,7 @@ def exact_match_hf_evaluate(
     return {
         "exact_match": np.mean(score_list)
-        if not multi_target
+        if not multiple_targets
         else float(np.any(score_list))
     }
...
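A minimal sketch (not the library's exact code path) of what the renamed `multiple_targets` flag means: one prediction is scored against every reference, and the result collapses to 1.0 if any reference matches exactly, 0.0 otherwise.

```python
import numpy as np

def exact_match_any(prediction: str, references: list[str]) -> float:
    # Mirrors the `float(np.any(score_list))` branch above:
    # one prediction, several acceptable gold answers.
    score_list = [float(prediction == ref) for ref in references]
    return float(np.any(score_list))

print(exact_match_any("Paris", ["Paris", "City of Paris"]))  # 1.0
print(exact_match_any("Lyon", ["Paris", "City of Paris"]))   # 0.0
```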
 from __future__ import annotations

 import logging
-import warnings
-from collections.abc import Iterable, Sequence
-from functools import partial
-from typing import TYPE_CHECKING, Any
-
-import datasets
+from random import Random
+from typing import TYPE_CHECKING

 if TYPE_CHECKING:
-    from random import Random
-
-    from lm_eval.api.task import ConfigurableTask, Task
+    from collections.abc import Iterable, Sequence
+    from typing import Any, TypeVar
+
+    _T = TypeVar("_T")

-eval_logger = logging.getLogger("lm-eval")
+eval_logger = logging.getLogger(__name__)


 class ContextSampler:
     def __init__(
         self,
-        docs: list[dict],
-        task: Task | ConfigurableTask,
-        fewshot_indices: Iterable | None = None,
-        rnd: Random | None = None,
+        docs: Sequence[dict[str, Any]] | None = None,
+        *,
+        rnd: int | None = None,
+        fewshot_indices: list[int] | None = None,
+        **kwargs,
     ) -> None:
-        self.rnd = rnd
-        if not self.rnd:
-            raise ValueError(
-                "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
-            )
-
-        self.task = task
-        self.config = task._config
-
-        self.target_delimiter = self.config.target_delimiter
-        self.fewshot_delimiter = self.config.fewshot_delimiter
-
-        if (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("doc_to_text", None) is not None
-        ):
-            self.doc_to_text = partial(
-                self.task.doc_to_text,
-                doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
-            )
-        else:
-            self.doc_to_text = self.task.doc_to_text
-
-        if (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("doc_to_target", None) is not None
-        ):
-            self.doc_to_target = partial(
-                self.task.doc_to_target,
-                doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
-            )
-        else:
-            self.doc_to_target = self.task.doc_to_target
-
-        if (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("doc_to_choice", None) is not None
-        ):
-            self.doc_to_choice = partial(
-                self.task.doc_to_choice,
-                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
-            )
-        else:
-            self.doc_to_choice = self.task.doc_to_choice
-
-        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
-        if fewshot_indices:  # subset few-shot docs from
-            if not isinstance(self.docs, datasets.Dataset):
-                raise ValueError(
-                    "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
-                )
-            self.docs = self.docs.select(fewshot_indices)
-
-    def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str | None = None):
-        # draw an extra fewshot sample if using same split as evaluating on
-        prefix = gen_prefix + " " if gen_prefix else ""
-        n_samples = (
-            num_fewshot + 1
-            if self.config.fewshot_split == self.config.test_split
-            else num_fewshot
-        )
-        # draw `n_samples` docs from fewshot_docs
-        fewshotex = self.sample(n_samples)
-
-        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
-        # TODO: should we just stop people from using fewshot from same split as evaluating?
-        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
-
-        labeled_examples = ""
-        for doc in selected_docs:
-            doc_content = self.doc_to_text(doc)
-            doc_target = self.doc_to_target(doc)
-            if (
-                self.config.doc_to_choice is None and isinstance(doc_content, str)
-            ) or isinstance(doc_content, str):
-                labeled_examples += doc_content
-            else:
-                if isinstance(doc_content, int):
-                    labeled_examples += self.doc_to_choice(doc)[doc_content]
-            if doc_target != "":
-                if self.target_delimiter.isspace() and str(doc_target)[0].isspace():
-                    # TODO: add logger warn once here.
-                    warnings.warn(
-                        "Both target_delimiter and target start with a space. This may cause issues.",
-                        Warning,
-                        stacklevel=2,
-                    )
-                labeled_examples += self.target_delimiter
-                labeled_examples += prefix
-                labeled_examples += (
-                    str(doc_target[0])
-                    if isinstance(doc_target, list)
-                    else doc_target
-                    if self.config.doc_to_choice is None or isinstance(doc_target, str)
-                    else str(self.doc_to_choice(doc)[doc_target])
-                )
-            labeled_examples += self.fewshot_delimiter
-
-        return labeled_examples
-
-    def get_chat_context(
-        self,
-        doc: dict,
-        num_fewshot: int,
-        fewshot_as_multiturn: bool = False,
-        gen_prefix: str | None = None,
-    ):
-        # TODO: Do we need any other delimiter
-        prefix = gen_prefix + " " if gen_prefix else ""
-        chat_history = []
-        # draw an extra fewshot sample if using same split as evaluating on
-        n_samples = (
-            num_fewshot + 1
-            if self.config.fewshot_split == self.config.test_split
-            else num_fewshot
-        )
-        # draw `n_samples` docs from fewshot_docs
-        fewshotex = self.sample(n_samples)
-
-        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
-        # TODO: should we just stop people from using fewshot from same split as evaluating?
-        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
-
-        if fewshot_as_multiturn:
-            for doc in selected_docs:
-                doc_content = self.doc_to_text(doc)
-                doc_target = self.doc_to_target(doc)
-                chat_history.append(
-                    {
-                        "role": "user",
-                        "content": doc_content
-                        if self.config.doc_to_choice is None
-                        or isinstance(doc_content, str)
-                        else self.doc_to_choice(doc)[doc_content],
-                    }
-                )
-                chat_history.append(
-                    {
-                        "role": "assistant",
-                        "content": prefix + str(doc_target[0])
-                        if isinstance(doc_target, list)
-                        else prefix + doc_target
-                        if self.config.doc_to_choice is None
-                        or isinstance(doc_target, str)
-                        else prefix + str(self.doc_to_choice(doc)[doc_target]),
-                    }
-                )
-        else:
-            # get fewshot context as one user turn
-            chat_history.append(
-                {
-                    "role": "user",
-                    "content": self.get_context(
-                        doc, num_fewshot, gen_prefix=gen_prefix
-                    ),
-                }
-            )
-
-        return chat_history
-
-    # @classmethod
-    # def from_fewshot_dfg(cls, cfg: FewshotConfig):
-    #     if not
-
-    def sample(self, n: int) -> Sequence[dict]:
+        self.rnd = Random(rnd)
+        self.docs = docs or []
+        self.fewshot_indices = fewshot_indices
+        if self.fewshot_indices and self.docs:
+            self.docs = [self.docs[i] for i in self.fewshot_indices]
+
+    def sample(
+        self, n: int, doc: dict[str, Any] | None = None, **kwargs
+    ) -> Sequence[dict]:
         """
-        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
+        Sample n documents from the pool.
+
+        Args:
+            n: Number of documents to sample
+            doc: Optional document to exclude from sampling
+
+        Returns:
+            List of sampled documents
         """
-        assert self.rnd is not None, (
-            "Error: `rnd` must be set to a random.Random instance before sampling."
-        )
-        return self.rnd.sample(self.docs, n)
+        if n <= 0:
+            return []
+        return (
+            self.rnd.sample(self.docs, n)
+            if not doc
+            else self.remove_doc(doc, self.rnd.sample(self.docs, n + 1))
+        )
+
+    def set_rnd(self, rnd: int) -> None:
+        self.rnd = Random(rnd)
+
+    @staticmethod
+    def remove_doc(doc: _T, _iter: Iterable[_T]) -> list[_T]:
+        return [x for x in _iter if x != doc]


 class FirstNSampler(ContextSampler):
-    def sample(self, n: int) -> Sequence[dict[str, Any]]:
+    def sample(self, n: int, doc=None, **kwargs):
         """
         Draw the first `n` samples in order from the specified split.
         Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
@@ -214,7 +72,7 @@ class FirstNSampler(ContextSampler):

 class BalancedSampler(ContextSampler):
-    def sample(self, n: int):
+    def sample(self, n: int, doc=None, **kwargs):
         """
         TODO: this should return approximately class-balanced samples from our fewshot examples.
         TODO: what order should they be in? maybe random?
@@ -224,7 +82,7 @@ class BalancedSampler(ContextSampler):

 class ManualSampler(ContextSampler):
-    def sample(self, n: int):
+    def sample(self, n: int, doc=None, **kwargs):
         """ """
         raise NotImplementedError
...
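Assuming the refactored signature above, a usage sketch of the new `ContextSampler`; the document contents and the `lm_eval.api.samplers` import path are illustrative assumptions.

```python
from lm_eval.api.samplers import ContextSampler  # assumed module path

docs = [{"question": f"q{i}", "answer": f"a{i}"} for i in range(10)]

# `rnd` is now an int seed (wrapped in random.Random internally), and
# `fewshot_indices` pre-selects the pool by position.
sampler = ContextSampler(docs, rnd=1234, fewshot_indices=[0, 2, 4, 6, 8])

# Passing `doc` draws one extra example and filters out any copy of the
# document being evaluated before it can appear as a few-shot example.
fewshot = sampler.sample(3, doc=docs[2])
print([d["question"] for d in fewshot])
```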
@@ -8,8 +8,8 @@ Homepage: [google-research-datasets/natural-questions@master/nq_open](https://github.com/google-research-datasets/natural-questions/tree/master/nq_open)
 Paper: [aclanthology.org/P19-1612](https://aclanthology.org/P19-1612/)

-Derived from the Natural Questions dataset, introduced in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf .
+Derived from the Natural Questions dataset, introduced
+in https://storage.googleapis.com/gweb-research2023-media/pubtools/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf .

 ### Citation
@@ -26,4 +26,5 @@ journal = {Transactions of the Association of Computational Linguistics}}
 * `nq_open`

 ### Changelog
-* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
+* 2025-07-21: Added `multiple_targets` to `exact_match`. Scores should not change.
@@ -5,7 +5,7 @@ training_split: train
 validation_split: validation
 description: "Answer these questions:\n\n"
 doc_to_text: "Q: {{question}}?\nA:"
-doc_to_target: "{{answer}}"
+doc_to_target: answer
 fewshot_delimiter: "\n"
 generation_kwargs:
   until:
@@ -27,7 +27,7 @@ metric_list:
     ignore_case: true
     ignore_punctuation: true
    regexes_to_ignore:
      - "\\b(?:The |the |An |A |The |a |an )"
-    multi_target: true
+    multiple_targets: true
 metadata:
   version: 4.0
@@ -79,3 +79,6 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+- 2025-07-22: `record` and `multirc`: set target_delimiter to "" and trim doc_to_text respectively.
 tag:
   - super-glue-lm-eval-v1
 task: boolq
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: boolq
 output_type: multiple_choice
 training_split: train
...
 tag:
   - super-glue-lm-eval-v1-seq2seq
 task: "boolq-seq2seq"
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: boolq
 output_type: generate_until
 training_split: train
 validation_split: validation
 doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
-doc_to_target: label
-doc_to_choice: [' no', ' yes']
+doc_to_target: "{{ [' no', ' yes'][label|int] }}"
 target_delimiter: ""
 generation_kwargs:
   until:
...
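The seq2seq and t5-prompt configs in this commit fold the former `doc_to_choice` list into an index-into-list Jinja expression in `doc_to_target`. A standalone sketch of how such a template renders (using jinja2 directly, outside the harness):

```python
from jinja2 import Template

# The boolq-seq2seq target template: pick ' no' or ' yes' by the integer label.
template = Template("{{ [' no', ' yes'][label|int] }}")
print(repr(template.render(label=0)))  # ' no'
print(repr(template.render(label=1)))  # ' yes'
```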
 tag:
   - super-glue-t5-prompt
 task: super_glue-boolq-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: boolq
 training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: "boolq passage: {{passage}} question: {{question}}"
-doc_to_target: label
-doc_to_choice: ['False', 'True']
+doc_to_target: "{{['False', 'True'][label|int]}}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: cb
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: cb
 output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
 doc_to_target: label
-doc_to_choice: ['True', 'False', 'Neither']
+doc_to_choice: ["True", "False", "Neither"]
 metric_list:
   - metric: acc
   - metric: f1
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-cb-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: cb
 training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
-doc_to_target: label
-doc_to_choice: ['entailment', 'contradiction', 'neutral']
+doc_to_target: "{{ ['entailment', 'contradiction', 'neutral'][label|int] }}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: copa
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: copa
 output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
-doc_to_target: !function utils.doc_to_target
-doc_to_choice: !function utils.doc_to_choice
+doc_to_target: label
+doc_to_choice: ["{{ choice1 }}", "{{ choice2 }}"]
 metric_list:
   - metric: acc
 metadata:
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-copa-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: copa
 training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
-doc_to_target: label
-doc_to_choice: ['choice1', 'choice2']
+doc_to_target: "{{ [choice1, choice2][label|int] }}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: multirc
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: multirc
 output_type: multiple_choice
 training_split: train
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-multirc-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: multirc
 training_split: train
 validation_split: validation
 output_type: generate_until
-doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
-doc_to_target: label
-doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
+doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}|trim"
+doc_to_target: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True'][label]}}"
 generation_kwargs:
   until:
     - "</s>"
...
 tag:
   - super-glue-lm-eval-v1
 task: record
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: record
 output_type: multiple_choice
 training_split: train
@@ -11,6 +11,7 @@ doc_to_target: !function util.doc_to_target
 doc_to_choice: !function util.doc_to_choice
 process_docs: !function util.process_docs
 process_results: !function util.process_results
+target_delimiter: ""
 metric_list:
   - metric: f1
     aggregation: mean
...
 tag:
   - super-glue-t5-prompt
 task: super_glue-record-t5-prompt
-dataset_path: super_glue
+dataset_path: aps/super_glue
 dataset_name: record
 validation_split: validation
 output_type: generate_until
...
@@ -19,7 +19,7 @@ def format_answer(query, entity):
 def doc_to_target(doc):
     # We only output the first correct entity in a doc
-    return format_answer(query=doc["query"], entity=doc["answers"][0])
+    return doc["entities"].index(doc["answers"][0])


 def doc_to_choice(doc):
...
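A sketch of why the `record` change above fits the `multiple_choice` setup: `doc_to_target` now returns the position of the first gold answer inside the entity list, i.e. an index into whatever candidates `doc_to_choice` produces. The example document is invented.

```python
doc = {
    "query": "The capital of France is @placeholder.",
    "entities": ["London", "Paris", "Berlin"],
    "answers": ["Paris"],
}

def doc_to_target(doc):
    # We only output the first correct entity in a doc
    return doc["entities"].index(doc["answers"][0])

print(doc_to_target(doc))  # 1 -> the index of "Paris" in doc["entities"]
```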