Commit 3e3a0d8f authored by Baber

Merge branch 'rm_multiple_target' into metrics

# Conflicts:
#	lm_eval/api/filter.py
#	lm_eval/api/metrics.py
#	lm_eval/api/task.py
#	lm_eval/filters/extraction.py
parents 2b4cdd41 00a77ebd
include: _template_yaml
dataset_name: uzb
task: multiblimp_uzb

include: _template_yaml
dataset_name: vep
task: multiblimp_vep

include: _template_yaml
dataset_name: wbp
task: multiblimp_wbp

include: _template_yaml
dataset_name: wol
task: multiblimp_wol

include: _template_yaml
dataset_name: xcl
task: multiblimp_xcl

include: _template_yaml
dataset_name: xnr
task: multiblimp_xnr

include: _template_yaml
dataset_name: xpg
task: multiblimp_xpg

include: _template_yaml
dataset_name: yrl
task: multiblimp_yrl
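Each new MultiBLiMP language is a three-line config that extends the shared `_template_yaml`. A hedged smoke-test sketch for the new tasks, assuming the harness's `lm_eval.simple_evaluate` entry point; the model choice and `limit` value here are illustrative, not part of this commit:

```python
# Quick check that the new multiblimp configs load and run end to end.
# Model and limit are arbitrary choices for a smoke test.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",
    tasks=["multiblimp_uzb", "multiblimp_wol"],
    limit=10,  # only a handful of docs per task
)
print(results["results"])
```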
@@ -24,3 +24,6 @@ journal = {Transactions of the Association of Computational Linguistics}}
 ### Tasks
 * `nq_open`
+### Changelog
+* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
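The changelog entry summarizes the merge: `exact_match` can now score against multiple gold targets. A minimal sketch of the any-match semantics (illustrative only, not the harness's actual implementation) shows why single-target scores are unaffected:

```python
# With multi_target, a prediction counts as correct if it matches ANY gold
# answer; with exactly one gold answer this reduces to plain exact match,
# which is why existing scores should not change.
def exact_match_multi(prediction: str, targets: list[str]) -> float:
    return float(any(prediction == t for t in targets))

assert exact_match_multi("Paris", ["Paris"]) == 1.0             # single target: unchanged
assert exact_match_multi("Paris", ["the capital", "Paris"]) == 1.0  # any-match over aliases
assert exact_match_multi("Lyon", ["Paris", "paris"]) == 0.0
```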
 task: nq_open
-dataset_path: nq_open
+dataset_path: google-research-datasets/nq_open
 output_type: generate_until
 training_split: train
 validation_split: validation
 description: "Answer these questions:\n\n"
 doc_to_text: "Q: {{question}}?\nA:"
-doc_to_target: "{{answer}}" # TODO: should be multi-target
+doc_to_target: "{{answer}}"
 fewshot_delimiter: "\n"
 generation_kwargs:
   until:
@@ -28,5 +28,6 @@ metric_list:
     ignore_punctuation: true
     regexes_to_ignore:
       - "\\b(?:The |the |An |A |The |a |an )"
+    multi_target: true
 metadata:
   version: 4.0
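With `multi_target: true`, `doc_to_target: "{{answer}}"` may render a list rather than a single string: nq_open's `answer` field is a list of acceptable answers, which is why the old single-target TODO could be dropped. A quick way to see this, assuming the `datasets` library and the dataset path set above:

```python
# Inspect one validation doc: "question" is a string, "answer" is a list of
# gold answers, so exact_match must be taken over all of them.
from datasets import load_dataset

doc = load_dataset("google-research-datasets/nq_open", split="validation")[0]
print(doc["question"])  # a single question string
print(doc["answer"])    # a list of acceptable answer strings
```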
@@ -49,3 +49,6 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+### Changelog
+* 2025-07-21: Added `multi_target` to `exact_match`. Scores should not change.
 task: triviaqa
-dataset_path: trivia_qa
+dataset_path: mandarjoshi/trivia_qa
 dataset_name: rc.nocontext
 output_type: generate_until
 training_split: train
@@ -27,5 +27,6 @@ metric_list:
     higher_is_better: true
     ignore_case: true
     ignore_punctuation: true
+    multi_target: true
 metadata:
   version: 3.0
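For reference, here is a sketch of what the `ignore_case`, `ignore_punctuation`, and (in nq_open) `regexes_to_ignore` options do to predictions and targets before comparison. It mirrors the usual exact-match normalization order but is not the harness's exact code:

```python
# Apply regexes_to_ignore first, then case folding (ignore_case: true),
# then punctuation stripping (ignore_punctuation: true).
import re
import string

def normalize(text: str, regexes_to_ignore: list[str]) -> str:
    for pattern in regexes_to_ignore:
        text = re.sub(pattern, "", text)  # drop ignored spans, e.g. leading articles
    text = text.lower()
    return text.translate(str.maketrans("", "", string.punctuation))

print(normalize("The Eiffel Tower.", [r"\b(?:The |the |An |A |a |an )"]))  # -> "eiffel tower"
```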
@@ -7,6 +7,11 @@ from lm_eval.api.task import ConfigurableTask
 from tests.test_tasks import BaseTasks, task_class
+@pytest.fixture()
+def limit() -> int:
+    return 10
 @pytest.mark.parametrize(
     "task_class",
     task_class(
...
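The new `limit` fixture caps how many documents the task tests evaluate. Pytest injects the fixture's return value into any test that declares a parameter of the same name; a self-contained sketch (the test body here is hypothetical, not from this diff):

```python
import pytest

@pytest.fixture()
def limit() -> int:
    return 10

def test_build_docs_respects_limit(limit):
    # Stand-in for slicing a task's eval docs, e.g. task.eval_docs[:limit].
    docs = list(range(100))[:limit]
    assert len(docs) == limit
```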