Commit 17396935 authored by lintangsutawika's avatar lintangsutawika

merged

parents cd8642e7 458342e2
+from functools import partial
 import datasets
@@ -15,9 +17,38 @@ class ContextSampler:
         self.target_delimiter = self.config.target_delimiter
         self.fewshot_delimiter = self.config.fewshot_delimiter

-        self.doc_to_text = self.task.doc_to_text
-        self.doc_to_target = self.task.doc_to_target
-        self.doc_to_choice = self.task.doc_to_choice
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_text", None) is not None
+        ):
+            self.doc_to_text = partial(
+                self.task.doc_to_text,
+                doc_to_text=self.config.fewshot_config.get("doc_to_text", None),
+            )
+        else:
+            self.doc_to_text = self.task.doc_to_text
+
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_target", None) is not None
+        ):
+            self.doc_to_target = partial(
+                self.task.doc_to_target,
+                doc_to_target=self.config.fewshot_config.get("doc_to_target", None),
+            )
+        else:
+            self.doc_to_target = self.task.doc_to_target
+
+        if (
+            self.config.fewshot_config is not None
+            and self.config.fewshot_config.get("doc_to_choice", None) is not None
+        ):
+            self.doc_to_choice = partial(
+                self.task.doc_to_choice,
+                doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None),
+            )
+        else:
+            self.doc_to_choice = self.task.doc_to_choice

         self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
         if fewshot_indices:  # subset few-shot docs from
@@ -52,6 +83,7 @@ class ContextSampler:
                     else self.doc_to_choice(doc)[doc_content]
                 )
                 labeled_examples += self.target_delimiter
-                labeled_examples += (
-                    str(doc_target[0])
-                    if isinstance(doc_target, list)
-                    else doc_target
-                    if self.config.doc_to_choice is None or isinstance(doc_target, str)
-                    else str(self.doc_to_choice(doc)[doc_target])
-                )
-                labeled_examples += self.fewshot_delimiter
+                if doc_target != "":
+                    labeled_examples += (
+                        str(doc_target[0])
+                        if isinstance(doc_target, list)
+                        else doc_target
+                        if self.config.doc_to_choice is None
+                        or isinstance(doc_target, str)
+                        else str(self.doc_to_choice(doc)[doc_target])
+                    )
+                    labeled_examples += self.fewshot_delimiter

         return labeled_examples
......
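The conditional chain above turns a few-shot target into text in one of three ways: the first element of a list of golds, the string itself, or an index into the task's choice list. A minimal sketch of that branch logic (names below are illustrative, not the harness API):

```python
def format_target(doc_target, choices=None):
    # mirrors the ternary above: list -> first gold, str -> as-is,
    # otherwise an integer index into doc_to_choice(doc)
    if isinstance(doc_target, list):
        return str(doc_target[0])
    if choices is None or isinstance(doc_target, str):
        return doc_target
    return str(choices[doc_target])

assert format_target(["yes", "no"]) == "yes"             # multiple golds
assert format_target("Paris") == "Paris"                 # already a string
assert format_target(2, choices=["A", "B", "C"]) == "C"  # choice index
```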
@@ -1171,9 +1171,11 @@ class ConfigurableTask(Task):
         """
         return doc

-    def doc_to_text(self, doc):
+    def doc_to_text(self, doc, doc_to_text=None):
         if self.prompt is not None:
             doc_to_text = self.prompt
+        elif doc_to_text is not None:
+            doc_to_text = doc_to_text
         else:
             doc_to_text = self.config.doc_to_text
@@ -1205,9 +1207,11 @@ class ConfigurableTask(Task):
             print(type(doc_to_text))
             raise TypeError

-    def doc_to_target(self, doc: Mapping) -> Union[int, str, list]:
+    def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
         if self.prompt is not None:
             doc_to_target = self.prompt
+        elif doc_to_target is not None:
+            doc_to_target = doc_to_target
         else:
             doc_to_target = self.config.doc_to_target
@@ -1249,9 +1253,11 @@ class ConfigurableTask(Task):
         else:
             raise TypeError

-    def doc_to_choice(self, doc: Any) -> List[str]:
+    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
         if self.prompt is not None:
             doc_to_choice = self.prompt
+        elif doc_to_choice is not None:
+            doc_to_choice = doc_to_choice
         elif self.config.doc_to_choice is None:
             eval_logger.error("doc_to_choice was called but not set in config")
         else:
......
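These three methods now resolve their template in the same precedence order: a task-level `prompt` first, then an explicit `doc_to_text`/`doc_to_target`/`doc_to_choice` argument (supplied via the `functools.partial` built in `ContextSampler` above), then the config default. A minimal sketch of that resolution, with a toy class standing in for the real task and config:

```python
from functools import partial

class MiniTask:
    """Toy stand-in for ConfigurableTask; only the precedence logic is real."""

    def __init__(self, prompt=None, config_template="Q: {question}\nA:"):
        self.prompt = prompt
        self.config_template = config_template

    def doc_to_text(self, doc, doc_to_text=None):
        if self.prompt is not None:        # 1. task-level prompt wins
            template = self.prompt
        elif doc_to_text is not None:      # 2. explicit override (few-shot config)
            template = doc_to_text
        else:                              # 3. fall back to the task config
            template = self.config_template
        return template(doc) if callable(template) else template.format(**doc)

task = MiniTask()
fewshot_doc_to_text = lambda doc: f"Q: {doc['question']}\nA: {doc['answer']}"

# What ContextSampler.__init__ builds when fewshot_config provides an override:
render = partial(task.doc_to_text, doc_to_text=fewshot_doc_to_text)
print(render({"question": "2+2?", "answer": "4"}))  # few-shot form, with the answer
print(task.doc_to_text({"question": "2+2?"}))       # normal form, from the config
```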
@@ -607,6 +607,78 @@ def evaluate(
                         _higher_is_better[m] = None
                 higher_is_better[group] = _higher_is_better

+                # collect all metric keys used by a subtask in the group.
+                metric_list = list(
+                    {
+                        key
+                        for task in task_list
+                        for key in results[task].keys()
+                        if "_stderr" not in key and key not in ["alias", "samples"]
+                    }
+                )
+                for metric in metric_list:
+                    stderr = "_stderr,".join(metric.split(","))
+
+                    # gather metrics, sizes, and stderrs from subtasks
+                    metrics = [
+                        results[task][metric]
+                        for task in task_list
+                        if metric in results[task]
+                    ]  # TODO: copy?
+                    stderrs = [
+                        results[task][stderr]
+                        for task in task_list
+                        if stderr in results[task]
+                    ]
+                    sizes = [
+                        results[task]["samples"]
+                        for task in task_list
+                        if metric in results[task]
+                    ]
+
+                    # compute group's pooled metric and stderr
+                    results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(
+                        metrics, sizes
+                    )
+                    # TODO: calculate grouped metric using aggregation fn
+                    if "N/A" in stderrs:
+                        results[group][stderr] = "N/A"
+                    else:
+                        results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(
+                            stderrs, sizes
+                        )
+                        # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
+                        # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
+                        # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+
+                    results[group]["samples"] = sum(sizes)
+
+        results_agg = defaultdict(dict)
+        groups_agg = defaultdict(dict)
+        all_tasks_list = list(task_hierarchy.keys())
+        while True:
+            add_tasks_list = list(k for k in results_agg.keys())
+            left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
+            if len(left_tasks_list) == 0:
+                break
+
+            _task_hierarchy = {
+                k: v for k, v in task_hierarchy.items() if k in left_tasks_list
+            }
+            _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)
+
+            results_agg = {**results_agg, **_results_agg}
+            groups_agg = {**groups_agg, **_groups_agg}
+
+        for group_name, task_list in task_hierarchy.items():
+            if task_list:
+                num_fewshot[group_name] = num_fewshot[
+                    task_list[0]
+                ]  # TODO: validate this
+
         results_dict = {
             "results": dict(results_agg.items()),
             **(
......
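The pooling step above collapses subtask scores into one group score. A hedged sketch of the shape of that computation follows; the authoritative formulas are `aggregate_subtask_metrics` and `pooled_sample_stderr` in `lm_eval.api.metrics`, and this toy version simply assumes a size-weighted mean and a classic pooled variance:

```python
import math

def weighted_mean(metrics, sizes):
    # size-weighted mean of subtask scores (the default, weight-by-size case)
    return sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)

def pooled_stderr(stderrs, sizes):
    # pooled standard error of the mean across disjoint subtask samples;
    # each subtask's sample variance is recovered as s_i^2 = se_i^2 * n_i
    total = sum(sizes)
    pooled_var = sum(
        (n - 1) * (se**2 * n) for se, n in zip(stderrs, sizes)
    ) / (total - len(sizes))
    return math.sqrt(pooled_var / total)

print(weighted_mean([0.80, 0.60], [100, 300]))   # 0.65
print(pooled_stderr([0.04, 0.025], [100, 300]))  # ~0.02
```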
@@ -17,7 +17,7 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
 ```bibtex
 @misc{wang2024mmlupro,
-      title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark}, 
+      title={MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark},
       author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
       year={2024},
       eprint={2406.01574},
@@ -31,19 +31,24 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
 #### Groups

 * `mmlu_pro`: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
-* `mmlu_pro_flan_cot_fewshot`: 'mmlu_pro_flan_cot_fewshot includes 5-shot of exemplars for chain-of-thought approach'
-* `mmlu_pro_flan_cot_zeroshot`: 'mmlu_pro_flan_cot_zeroshot evaluates using zero-shot chain-of-thought approach'
-* `mmlu_pro_generative`: 'mmlu_pro_generative solves questions of mmlu_pro using direct (generative) approach'
-* `mmlu_pro_continuation`: 'mmlu_pro_continuation evaluates the ability to continue and complete a given text'

 #### Tasks

 The following tasks evaluate subjects in the mmlu_pro dataset
-- `mmlu_pro_{subject_english}`
-- `mmlu_pro_flan_cot_fewshot_{subject_english}`
-- `mmlu_pro_flan_cot_zeroshot_{subject_english}`
-- `mmlu_pro_generative_{subject_english}`
-- `mmlu_pro_continuation_{subject_english}`
+- `mmlu_pro_biology`
+- `mmlu_pro_business`
+- `mmlu_pro_chemistry`
+- `mmlu_pro_computer_science`
+- `mmlu_pro_economics`
+- `mmlu_pro_engineering`
+- `mmlu_pro_health`
+- `mmlu_pro_history`
+- `mmlu_pro_law`
+- `mmlu_pro_math`
+- `mmlu_pro_other`
+- `mmlu_pro_philosophy`
+- `mmlu_pro_physics`
+- `mmlu_pro_psychology`
### Checklist
......
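For context, a hedged sketch of how the `mmlu_pro` group would be run through the harness's Python entry point (the model choice and arguments below are placeholders, not part of this change):

```python
import lm_eval

# Placeholder model/arguments; any lm-eval-supported model string works here.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["mmlu_pro"],
    num_fewshot=5,
)
print(results["results"]["mmlu_pro"])  # pooled exact_match for the group
```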
dataset_path: TIGER-Lab/MMLU-Pro
test_split: test
fewshot_split: validation
fewshot_config:
  sampler: first_n
  doc_to_text: !function utils.fewshot_to_text
  doc_to_target: ""
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
  - name: "custom-extract"
    filter:
      - function: "regex"
        regex_pattern: 'answer is \(?([ABCDEFGHIJ])\)?'
        # regex_pattern: r".*[aA]nswer:\s*([A-J])",
      - function: "take_first"
generation_kwargs:
  until:
    - "</s>"
    - "Q:"
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0
num_fewshot: 5
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 0.0
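To make the filter concrete, a small illustration of what the `custom-extract` regex pulls out of a chain-of-thought completion (the completion text is invented; the harness's regex filter applies the same pattern for real):

```python
import re

# Same pattern as regex_pattern above; group 1 is the answer letter.
pattern = re.compile(r"answer is \(?([ABCDEFGHIJ])\)?")

completion = "Let's think step by step. Osmosis moves water, so the answer is (C)."
match = pattern.search(completion)
print(match.group(1) if match else "[no match]")  # -> C
```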
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
SUBJECTS = {
"business": "other",
"law": "humanities",
"psychology": "social_sciences",
"biology": "stem",
"chemistry": "stem",
"history": "humanities",
"other": "other",
"health": "other",
"economics": "social_sciences",
"math": "stem",
"physics": "stem",
"computer_science": "stem",
"philosophy": "humanities",
"engineering": "stem"
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="mmlu_pro")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"mmlu_pro_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"mmlu_pro_{category}",
"group_alias": category.replace("_", " "),
"task": f"mmlu_pro_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"mmlu_pro_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_pro_subcategories = [
f"mmlu_pro_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_pro_subcategories = [f"mmlu_pro_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
{
"group": f"mmlu_pro_{args.task_prefix}"
if args.task_prefix != ""
else "mmlu_pro",
"task": mmlu_pro_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
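The fully quoted per-subject file that follows is what `yaml.dump(..., default_style='"')` in the script above emits; a tiny reproduction of that behavior with a trimmed-down dict:

```python
import yaml

# default_style='"' forces every scalar (keys and values) into double quotes,
# which is why the generated YAML below looks fully quoted.
print(
    yaml.dump(
        {"include": "_default_template_yaml", "task": "mmlu_pro_health"},
        allow_unicode=True,
        default_style='"',
    )
)
# "include": "_default_template_yaml"
# "task": "mmlu_pro_health"
```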
"dataset_name": "health"
"description": "The following are multiple choice questions (with answers) about health.\n\
\n"
"group": "mmlu_pro_other"
"group_alias": "other"
"include": "_default_template_yaml"
"task": "mmlu_pro_health"
"task_alias": "health"
description: "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_biology"
task_alias: "biology"
process_docs: !function utils.process_biology
description: "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_business"
task_alias: "business"
process_docs: !function utils.process_business
description: "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_chemistry"
task_alias: "chemistry"
process_docs: !function utils.process_chemistry
description: "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_computer_science"
task_alias: "computer_science"
process_docs: !function utils.process_computer_science
description: "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_economics"
task_alias: "economics"
process_docs: !function utils.process_economics
description: "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_engineering"
task_alias: "engineering"
process_docs: !function utils.process_engineering
description: "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_health"
task_alias: "health"
process_docs: !function utils.process_health
description: "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_history"
task_alias: "history"
process_docs: !function utils.process_history
description: "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_law"
task_alias: "law"
process_docs: !function utils.process_law
description: "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_math"
task_alias: "math"
process_docs: !function utils.process_math
description: "The following are multiple choice questions (with answers) about other. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_other"
task_alias: "other"
process_docs: !function utils.process_other
description: "The following are multiple choice questions (with answers) about philosophy. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
include: "_default_template_yaml"
task: "mmlu_pro_philosophy"
task_alias: "philosophy"
process_docs: !function utils.process_philosophy