Commit 17396935 authored by lintangsutawika

merged

parents cd8642e7 458342e2
from functools import partial
import datasets
...@@ -15,8 +17,37 @@ class ContextSampler:
self.target_delimiter = self.config.target_delimiter
self.fewshot_delimiter = self.config.fewshot_delimiter
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_text", None) is not None
):
self.doc_to_text = partial(
self.task.doc_to_text,
doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
)
else:
self.doc_to_text = self.task.doc_to_text
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_target", None) is not None
):
self.doc_to_target = partial(
self.task.doc_to_target,
doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
)
else:
self.doc_to_target = self.task.doc_to_target
if (
self.config.fewshot_config is not None
and self.config.fewshot_config.get("doc_to_choice", None) is not None
):
self.doc_to_choice = partial(
self.task.doc_to_choice,
doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
)
else:
self.doc_to_choice = self.task.doc_to_choice
self.docs = docs # HF dataset split, provided by task._fewshot_docs()
...@@ -52,6 +83,7 @@ class ContextSampler:
else self.doc_to_choice(doc)[doc_content]
)
labeled_examples += self.target_delimiter
<<<<<<< HEAD
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
...@@ -60,6 +92,17 @@ class ContextSampler:
else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter
=======
if doc_target != "":
labeled_examples += (
str(doc_target[0])
if isinstance(doc_target, list)
else doc_target
if self.config.doc_to_choice is None or isinstance(doc_target, str)
else str(self.doc_to_choice(doc)[doc_target])
)
labeled_examples += self.fewshot_delimiter
>>>>>>> mmlu-pro-changes
return labeled_examples
...
...@@ -1171,9 +1171,11 @@ class ConfigurableTask(Task):
"""
return doc
def doc_to_text(self, doc, doc_to_text=None):
if self.prompt is not None:
doc_to_text = self.prompt
elif doc_to_text is not None:
doc_to_text = doc_to_text
else:
doc_to_text = self.config.doc_to_text
...@@ -1205,9 +1207,11 @@ class ConfigurableTask(Task):
print(type(doc_to_text))
raise TypeError
def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
if self.prompt is not None:
doc_to_target = self.prompt
elif doc_to_target is not None:
doc_to_target = doc_to_target
else:
doc_to_target = self.config.doc_to_target
...@@ -1249,9 +1253,11 @@ class ConfigurableTask(Task):
else:
raise TypeError
def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
if self.prompt is not None:
doc_to_choice = self.prompt
elif doc_to_choice is not None:
doc_to_choice = doc_to_choice
elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config")
else:
...
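Read together with the ContextSampler diff above, these new optional parameters let the few-shot sampler bind a per-fewshot override through functools.partial, so that calling the bound function with only the document routes through the override instead of the task-level template. A minimal, self-contained sketch of that call pattern (the Task class and template strings below are illustrative placeholders, not the harness classes):

from functools import partial

class Task:
    # Illustrative stand-in for ConfigurableTask; only the override logic is mirrored.
    def doc_to_text(self, doc, doc_to_text=None):
        # An explicit doc_to_text argument wins over the task-level template,
        # matching the new `elif doc_to_text is not None` branch in the diff.
        template = doc_to_text if doc_to_text is not None else "Q: {question}\nA:"
        return template.format(**doc)

task = Task()
# What ContextSampler now does when fewshot_config supplies a doc_to_text override:
fewshot_doc_to_text = partial(task.doc_to_text, doc_to_text="{question}\nAnswer:")

doc = {"question": "What is 2 + 2?"}
print(task.doc_to_text(doc))      # task-level template
print(fewshot_doc_to_text(doc))   # few-shot override, same one-argument call as before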
...@@ -607,6 +607,78 @@ def evaluate(
_higher_is_better[m] = None
higher_is_better[group] = _higher_is_better
<<<<<<< HEAD
=======
# collect all metric keys used by a subtask in the group.
metric_list = list(
{
key
for task in task_list
for key in results[task].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
}
)
for metric in metric_list:
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric]
for task in task_list
if metric in results[task]
] # TODO: copy?
stderrs = [
results[task][stderr]
for task in task_list
if stderr in results[task]
]
sizes = [
results[task]["samples"]
for task in task_list
if metric in results[task]
]
# compute group's pooled metric and stderr
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
results_agg = defaultdict(dict)
groups_agg = defaultdict(dict)
all_tasks_list = list(task_hierarchy.keys())
while True:
add_tasks_list = list(k for k in results_agg.keys())
left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
if len(left_tasks_list) == 0:
break
_task_hierarchy = {
k: v for k, v in task_hierarchy.items() if k in left_tasks_list
}
_results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
results_agg = {**results_agg, **_results_agg}
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
>>>>>>> mmlu-pro-changes
results_dict = {
"results": dict(results_agg.items()),
**(
...
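For context, the mmlu-pro-changes side of this hunk pools each group metric as a sample-size-weighted combination of its subtasks and pools the stderrs to match. The exact formulas live in lm_eval.api.metrics.aggregate_subtask_metrics and pooled_sample_stderr and may differ in detail; as a rough sketch of the intent only, a size-weighted mean and the standard error of such a weighted mean look like this:

import math

def weighted_mean(metrics, sizes):
    # Size-weighted average of per-subtask scores (illustration, not the harness code).
    return sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)

def weighted_mean_stderr(stderrs, sizes):
    # Standard error of the size-weighted mean, treating subtask means as independent:
    # Var(sum_i (n_i / N) * x_i) = sum_i (n_i / N)^2 * se_i^2
    total = sum(sizes)
    var = sum((n / total) ** 2 * se ** 2 for se, n in zip(stderrs, sizes))
    return math.sqrt(var)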
...@@ -31,19 +31,24 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
#### Groups
* `mmlu_pro`: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
* `mmlu_pro_flan_cot_fewshot`: 'mmlu_pro_flan_cot_fewshot includes 5-shot of exemplars for chain-of-thought approach'
* `mmlu_pro_flan_cot_zeroshot`: 'mmlu_pro_flan_cot_zeroshot evaluates using zero-shot chain-of-thought approach'
* `mmlu_pro_generative`: 'mmlu_pro_generative solves questions of mmlu_pro using direct (generative) approach'
* `mmlu_pro_continuation`: 'mmlu_pro_continuation evaluates the ability to continue and complete a given text'
#### Tasks
The following tasks evaluate subjects in the mmlu_pro dataset (a sample invocation is sketched after the list):
- `mmlu_pro_biology`
- `mmlu_pro_business`
- `mmlu_pro_chemistry`
- `mmlu_pro_computer_science`
- `mmlu_pro_economics`
- `mmlu_pro_engineering`
- `mmlu_pro_health`
- `mmlu_pro_history`
- `mmlu_pro_law`
- `mmlu_pro_math`
- `mmlu_pro_other`
- `mmlu_pro_philosophy`
- `mmlu_pro_physics`
- `mmlu_pro_psychology`
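Once the configs in this commit are installed, the group or any single subject task can be requested by the names above. A hypothetical invocation through the Python API (the pretrained model string is a placeholder, and the exact keys of the returned results dict may vary by harness version):

import lm_eval

# Evaluate the whole group; pass e.g. ["mmlu_pro_biology"] to run a single subject.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder checkpoint
    tasks=["mmlu_pro"],
)
print(results["results"])  # per-task and per-group scores, keyed by task/group name

The CLI equivalent would be along the lines of `lm_eval --model hf --model_args pretrained=... --tasks mmlu_pro`.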
### Checklist
...
dataset_path: TIGER-Lab/MMLU-Pro
test_split: test
fewshot_split: validation
fewshot_config:
sampler: first_n
doc_to_text: !function utils.fewshot_to_text
doc_to_target: ""
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
- name: "custom-extract"
filter:
- function: "regex"
regex_pattern: r"answer is \(?([ABCDEFGHIJ])\)?"
# regex_pattern: r".*[aA]nswer:\s*([A-J])",
- function: "take_first"
generation_kwargs:
until:
- "</s>"
- "Q:"
- "<|im_end|>"
do_sample: false
temperature: 0.0
num_fewshot: 5
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 0.0
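For what it's worth, the custom-extract filter above runs the regex over each generated chain of thought to pull out the predicted letter, and take_first then keeps a single extraction per document. A quick standalone check of that pattern (the sample completion below is made up):

import re

# Same answer-extraction pattern as regex_pattern in the filter_list above.
ANSWER_RE = re.compile(r"answer is \(?([ABCDEFGHIJ])\)?")

completion = "Let's think step by step. Only option C is consistent, so the answer is (C)."
match = ANSWER_RE.search(completion)
print(match.group(1) if match else "[no match]")  # prints: C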
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
SUBJECTS = {
"business": "other",
"law": "humanities",
"psychology": "social_sciences",
"biology": "stem",
"chemistry": "stem",
"history": "humanities",
"other": "other",
"health": "other",
"economics": "social_sciences",
"math": "stem",
"physics": "stem",
"computer_science": "stem",
"philosophy": "humanities",
"engineering": "stem"
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="mmlu_pro")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"mmlu_pro_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"mmlu_pro_{category}",
"group_alias": category.replace("_", " "),
"task": f"mmlu_pro_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"mmlu_pro_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_pro_subcategories = [
f"mmlu_pro_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_pro_subcategories = [f"mmlu_pro_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
{
"group": f"mmlu_pro_{args.task_prefix}"
if args.task_prefix != ""
else "mmlu_pro",
"task": mmlu_pro_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
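For reference, the per-subject YAML files that follow look like the output of this script, invoked with something along the lines of `python _generate_configs.py --base_yaml_path _default_template_yaml --save_prefix_path mmlu_pro` (optionally with `--cot_prompt_path` pointing at a JSON file mapping each subject to a chain-of-thought description); each run writes one `mmlu_pro_<subject>.yaml` per entry in SUBJECTS plus a top-level group YAML.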
"dataset_name": "health"
"description": "The following are multiple choice questions (with answers) about health.\n\
\n"
"group": "mmlu_pro_other"
"group_alias": "other"
"include": "_default_template_yaml"
"task": "mmlu_pro_health"
"task_alias": "health"
description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_biology"
task_alias: "biology"
process_docs: !function utils.process_biology
description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_business"
task_alias: "business"
process_docs: !function utils.process_business
description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_chemistry"
task_alias: "chemistry"
process_docs: !function utils.process_chemistry
description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_computer_science"
task_alias: "computer_science"
process_docs: !function utils.process_computer_science
description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_economics"
task_alias: "economics"
process_docs: !function utils.process_economics
description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_engineering"
task_alias: "engineering"
process_docs: !function utils.process_engineering
description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_health"
task_alias: "health"
process_docs: !function utils.process_health
description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_history"
task_alias: "history"
process_docs: !function utils.process_history
description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_law"
task_alias: "law"
process_docs: !function utils.process_law
description: "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_math"
task_alias: "math"
process_docs: !function utils.process_math
description: "The following are multiple choice questions (with answers) about other. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_other"
task_alias: "other"
process_docs: !function utils.process_other
description: "The following are multiple choice questions (with answers) about philosophy. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_philosophy"
task_alias: "philosophy"
process_docs: !function utils.process_philosophy