Unverified Commit 8ad598df authored by Jungwhan Kim's avatar Jungwhan Kim Committed by GitHub
Browse files

keep new line for task description (#2116)



* add keep trailing newline

* apply ruff-format

* add prompt unit test

* increment the version of tasks that have description with whitespace

* remove white spaces of leaderboard bbh

* update MMLU expected versions in output

* CI run does display the expected version=1 for mmlu subtasks, fix expected test output again

---------
Co-authored-by: haileyschoelkopf <hailey@eleuther.ai>
parent 0571eeb1
...@@ -29,4 +29,4 @@ aggregate_metric_list: ...@@ -29,4 +29,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -12,6 +12,6 @@ metric_list: ...@@ -12,6 +12,6 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 2.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -15,6 +15,6 @@ metric_list: ...@@ -15,6 +15,6 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 2.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -29,4 +29,4 @@ aggregate_metric_list: ...@@ -29,4 +29,4 @@ aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -9,7 +9,7 @@ task: ...@@ -9,7 +9,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_ao_other - group: mmlusr_ao_other
group_alias: Other (Answer Only) group_alias: Other (Answer Only)
task: task:
...@@ -18,7 +18,7 @@ task: ...@@ -18,7 +18,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_ao_social_sciences - group: mmlusr_ao_social_sciences
group_alias: Social Sciences (Answer Only) group_alias: Social Sciences (Answer Only)
task: task:
...@@ -27,7 +27,7 @@ task: ...@@ -27,7 +27,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_ao_humanities - group: mmlusr_ao_humanities
group_alias: Humanities (Answer Only) group_alias: Humanities (Answer Only)
task: task:
...@@ -36,9 +36,9 @@ task: ...@@ -36,9 +36,9 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
aggregate_metric_list: aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -13,4 +13,4 @@ metric_list: ...@@ -13,4 +13,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -13,4 +13,4 @@ metric_list: ...@@ -13,4 +13,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -9,7 +9,7 @@ task: ...@@ -9,7 +9,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_qa_other - group: mmlusr_qa_other
group_alias: Other (Question & Answer) group_alias: Other (Question & Answer)
task: task:
...@@ -18,7 +18,7 @@ task: ...@@ -18,7 +18,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_qa_social_sciences - group: mmlusr_qa_social_sciences
group_alias: Social Sciences (Question & Answer) group_alias: Social Sciences (Question & Answer)
task: task:
...@@ -27,7 +27,7 @@ task: ...@@ -27,7 +27,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_qa_humanities - group: mmlusr_qa_humanities
group_alias: Humanities (Question & Answer) group_alias: Humanities (Question & Answer)
task: task:
...@@ -36,9 +36,9 @@ task: ...@@ -36,9 +36,9 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
aggregate_metric_list: aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -13,4 +13,4 @@ metric_list: ...@@ -13,4 +13,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 1.0
...@@ -9,7 +9,7 @@ task: ...@@ -9,7 +9,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_qo_other - group: mmlusr_qo_other
group_alias: Other (Question Only) group_alias: Other (Question Only)
task: task:
...@@ -18,7 +18,7 @@ task: ...@@ -18,7 +18,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_qo_social_sciences - group: mmlusr_qo_social_sciences
group_alias: Social Sciences (Question Only) group_alias: Social Sciences (Question Only)
task: task:
...@@ -27,7 +27,7 @@ task: ...@@ -27,7 +27,7 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
- group: mmlusr_qo_humanities - group: mmlusr_qo_humanities
group_alias: Humanities (Question Only) group_alias: Humanities (Question Only)
task: task:
...@@ -36,9 +36,9 @@ task: ...@@ -36,9 +36,9 @@ task:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
aggregate_metric_list: aggregate_metric_list:
- metric: acc - metric: acc
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 1 version: 2
...@@ -29,4 +29,4 @@ metric_list: ...@@ -29,4 +29,4 @@ metric_list:
regexes_to_ignore: regexes_to_ignore:
- "\\b(?:The |the |An |A |The |a |an )" - "\\b(?:The |the |An |A |The |a |an )"
metadata: metadata:
version: 3.0 version: 4.0
...@@ -16,4 +16,4 @@ metric_list: ...@@ -16,4 +16,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.1 version: 1.0
...@@ -12,4 +12,4 @@ metric_list: ...@@ -12,4 +12,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0 version: 1
...@@ -8,4 +8,4 @@ aggregate_metric_list: ...@@ -8,4 +8,4 @@ aggregate_metric_list:
aggregation: mean aggregation: mean
weight_by_size: True weight_by_size: True
metadata: metadata:
version: 0 version: 1
...@@ -471,7 +471,9 @@ def regex_replace(string, pattern, repl, count: int = 0): ...@@ -471,7 +471,9 @@ def regex_replace(string, pattern, repl, count: int = 0):
return re.sub(pattern, repl, string, count=count) return re.sub(pattern, repl, string, count=count)
env = Environment(loader=BaseLoader, undefined=StrictUndefined) env = Environment(
loader=BaseLoader, undefined=StrictUndefined, keep_trailing_newline=True
)
env.filters["regex_replace"] = regex_replace env.filters["regex_replace"] = regex_replace
......
import random
from typing import List
import numpy as np
import pytest
from lm_eval import tasks
from lm_eval.tasks import TaskManager
from lm_eval.utils import join_iters
# Expected prompt for the "mmlu_anatomy" task rendered with num_fewshot=0:
# the task description followed by a single unanswered question.
# The test compares this with `==` against the rendered context, so every
# character (including newlines and trailing whitespace) is significant.
# NOTE(review): confirm the blank lines separating the description from the
# question match the rendered prompt exactly.
MMLU_ANATOMY_ZERO_SHOT = """The following are multiple choice questions (with answers) about anatomy.
A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral
A. paralysis of the facial muscles.
B. paralysis of the facial muscles and loss of taste.
C. paralysis of the facial muscles, loss of taste and lacrimation.
D. paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.
Answer:"""
# Expected prompt for the "mmlu_anatomy" task rendered with num_fewshot=5:
# the task description, five answered example questions, then the same
# unanswered target question used in the zero-shot prompt.
# The test compares this with `==` against the rendered context, so every
# character (including newlines and trailing whitespace) is significant.
# NOTE(review): confirm the blank lines separating the description and each
# example match the rendered prompt exactly.
MMLU_ANATOMY_FIVE_SHOT = """The following are multiple choice questions (with answers) about anatomy.
What is the embryological origin of the hyoid bone?
A. The first pharyngeal arch
B. The first and second pharyngeal arches
C. The second pharyngeal arch
D. The second and third pharyngeal arches
Answer: D
Which of these branches of the trigeminal nerve contain somatic motor processes?
A. The supraorbital nerve
B. The infraorbital nerve
C. The mental nerve
D. None of the above
Answer: D
The pleura
A. have no sensory innervation.
B. are separated by a 2 mm space.
C. extend into the neck.
D. are composed of respiratory epithelium.
Answer: C
In Angle's Class II Div 2 occlusion there is
A. excess overbite of the upper lateral incisors.
B. negative overjet of the upper central incisors.
C. excess overjet of the upper lateral incisors.
D. excess overjet of the upper central incisors.
Answer: C
Which of the following is the body cavity that contains the pituitary gland?
A. Abdominal
B. Cranial
C. Pleural
D. Spinal
Answer: B
A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral
A. paralysis of the facial muscles.
B. paralysis of the facial muscles and loss of taste.
C. paralysis of the facial muscles, loss of taste and lacrimation.
D. paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.
Answer:"""
@pytest.mark.parametrize(
    "task_names,sets,num_fewshot,seed,num_examples,expected_prompt",
    [
        (["mmlu_anatomy"], "test", 0, 42, 1, MMLU_ANATOMY_ZERO_SHOT),
        (["mmlu_anatomy"], "test", 5, 42, 1, MMLU_ANATOMY_FIVE_SHOT),
    ],
)
def test_mmlu_prompt_rendering(
    task_names: List[str],
    sets: str,
    num_fewshot: int,
    seed: int,
    num_examples: int,
    expected_prompt: str,
):
    """Check that rendered few-shot contexts match the expected prompt exactly.

    Args:
        task_names: Task names passed to ``tasks.get_task_dict``.
        sets: Comma-separated split names; "train", "val" and "test" are
            recognised, any other name contributes no documents.
        num_fewshot: Number of few-shot examples passed to
            ``task.fewshot_context``.
        seed: Seed applied to NumPy's global random state before loading tasks.
        num_examples: Maximum number of documents to check per task; a value
            of 0 or less checks every document.
        expected_prompt: Exact string every rendered context must equal.

    Raises:
        ValueError: If none of the requested splits yields documents for a task.
    """
    np.random.seed(seed)

    task_manager = TaskManager()
    task_dict = tasks.get_task_dict(task_names, task_manager)

    for task_name, task in task_dict.items():
        # Some entries are tuples; the task object is the second element.
        if isinstance(task, tuple):
            _, task = task

        # Collect the document iterables of every requested split the task has.
        iters = []
        for split in sets.split(","):
            docs = None
            if split == "train" and task.has_training_docs():
                docs = task.training_docs()
            elif split == "val" and task.has_validation_docs():
                docs = task.validation_docs()
            elif split == "test" and task.has_test_docs():
                docs = task.test_docs()
            if docs is not None:
                iters.append(docs)

        if len(iters) == 0:
            raise ValueError(
                f"No documents found for task {task_name!r} in splits {sets!r}"
            )

        docs = join_iters(iters)

        # zip() against a range caps the number of documents inspected;
        # enumerate() walks all of them when no positive cap is given.
        for _, doc in (
            zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
        ):
            ctx = task.fewshot_context(
                doc=doc,
                num_fewshot=num_fewshot,
            )
            assert ctx == expected_prompt
| Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| | Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
|-------------------------------|------:|------|-----:|------|---|-----:|---|------| |-------------------------------|------:|------|-----:|------|---|-----:|---|------|
|stem | 1|none | |acc |↑ |0.2474|± | N/A| |stem | 2|none | |acc |↑ |0.2474|± | N/A|
| - abstract_algebra | 0|none | 0|acc |↑ |0.2000|± | N/A| | - abstract_algebra | 1|none | 0|acc |↑ |0.2000|± | N/A|
| - anatomy | 0|none | 0|acc |↑ |0.3000|± | N/A| | - anatomy | 1|none | 0|acc |↑ |0.3000|± | N/A|
| - astronomy | 0|none | 0|acc |↑ |0.1000|± | N/A| | - astronomy | 1|none | 0|acc |↑ |0.1000|± | N/A|
| - college_biology | 0|none | 0|acc |↑ |0.3000|± | N/A| | - college_biology | 1|none | 0|acc |↑ |0.3000|± | N/A|
| - college_chemistry | 0|none | 0|acc |↑ |0.1000|± | N/A| | - college_chemistry | 1|none | 0|acc |↑ |0.1000|± | N/A|
| - college_computer_science | 0|none | 0|acc |↑ |0.2000|± | N/A| | - college_computer_science | 1|none | 0|acc |↑ |0.2000|± | N/A|
| - college_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A| | - college_mathematics | 1|none | 0|acc |↑ |0.2000|± | N/A|
| - college_physics | 0|none | 0|acc |↑ |0.3000|± | N/A| | - college_physics | 1|none | 0|acc |↑ |0.3000|± | N/A|
| - computer_security | 0|none | 0|acc |↑ |0.5000|± | N/A| | - computer_security | 1|none | 0|acc |↑ |0.5000|± | N/A|
| - conceptual_physics | 0|none | 0|acc |↑ |0.3000|± | N/A| | - conceptual_physics | 1|none | 0|acc |↑ |0.3000|± | N/A|
| - electrical_engineering | 0|none | 0|acc |↑ |0.4000|± | N/A| | - electrical_engineering | 1|none | 0|acc |↑ |0.4000|± | N/A|
| - elementary_mathematics | 0|none | 0|acc |↑ |0.0000|± | N/A| | - elementary_mathematics | 1|none | 0|acc |↑ |0.0000|± | N/A|
| - high_school_biology | 0|none | 0|acc |↑ |0.3000|± | N/A| | - high_school_biology | 1|none | 0|acc |↑ |0.3000|± | N/A|
| - high_school_chemistry | 0|none | 0|acc |↑ |0.4000|± | N/A| | - high_school_chemistry | 1|none | 0|acc |↑ |0.4000|± | N/A|
| - high_school_computer_science| 0|none | 0|acc |↑ |0.3000|± | N/A| | - high_school_computer_science| 1|none | 0|acc |↑ |0.3000|± | N/A|
| - high_school_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A| | - high_school_mathematics | 1|none | 0|acc |↑ |0.2000|± | N/A|
| - high_school_physics | 0|none | 0|acc |↑ |0.3000|± | N/A| | - high_school_physics | 1|none | 0|acc |↑ |0.3000|± | N/A|
| - high_school_statistics | 0|none | 0|acc |↑ |0.0000|± | N/A| | - high_school_statistics | 1|none | 0|acc |↑ |0.0000|± | N/A|
| - machine_learning | 0|none | 0|acc |↑ |0.3000|± | N/A| | - machine_learning | 1|none | 0|acc |↑ |0.3000|± | N/A|
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment