Commit 3e3a0d8f authored by Baber's avatar Baber
Browse files

Merge branch 'rm_multiple_target' into metrics

# Conflicts:
#	lm_eval/api/filter.py
#	lm_eval/api/metrics.py
#	lm_eval/api/task.py
#	lm_eval/filters/extraction.py
parents 2b4cdd41 00a77ebd
include: _template_yaml
dataset_name: uzb
task: multiblimp_uzb
include: _template_yaml
dataset_name: vep
task: multiblimp_vep
include: _template_yaml
dataset_name: wbp
task: multiblimp_wbp
include: _template_yaml
dataset_name: wol
task: multiblimp_wol
include: _template_yaml
dataset_name: xcl
task: multiblimp_xcl
include: _template_yaml
dataset_name: xnr
task: multiblimp_xnr
include: _template_yaml
dataset_name: xpg
task: multiblimp_xpg
include: _template_yaml
dataset_name: yrl
task: multiblimp_yrl
......@@ -24,3 +24,6 @@ journal = {Transactions of the Association of Computational Linguistics}}
### Tasks
* `nq_open`
### Changelog
* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
task: nq_open
dataset_path: google-research-datasets/nq_open
output_type: generate_until
training_split: train
validation_split: validation
description: "Answer these questions:\n\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}"
fewshot_delimiter: "\n"
generation_kwargs:
until:
......@@ -28,5 +28,6 @@ metric_list:
ignore_punctuation: true
regexes_to_ignore:
- "\\b(?:The |the |An |A |The |a |an )"
multi_target: true
metadata:
version: 4.0
......@@ -49,3 +49,6 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog
* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
task: triviaqa
dataset_path: mandarjoshi/trivia_qa
dataset_name: rc.nocontext
output_type: generate_until
training_split: train
......@@ -27,5 +27,6 @@ metric_list:
higher_is_better: true
ignore_case: true
ignore_punctuation: true
multi_target: true
metadata:
version: 3.0
......@@ -7,6 +7,11 @@ from lm_eval.api.task import ConfigurableTask
from tests.test_tasks import BaseTasks, task_class
@pytest.fixture()
def limit() -> int:
    """Fixture supplying the shared evaluation limit for task tests.

    Returns:
        The fixed limit value (10) — presumably the number of documents
        evaluated per task; confirm against the consuming tests.
    """
    doc_cap = 10
    return doc_cap
@pytest.mark.parametrize(
"task_class",
task_class(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment