Commit 191458b8 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'mmlu-refactorbranch' of...

Merge branch 'mmlu-refactorbranch' of https://github.com/EleutherAI/lm-evaluation-harness into flan-benchmark
parents a81ef1a7 9b00813f
......@@ -40,6 +40,6 @@ repos:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt
.*\.json|ignore.txt|.*\.yaml
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
# All MMLU subject (dataset subset) names for which a per-subject YAML is
# generated by the __main__ block below.
# NOTE(review): "abstract_algebra" is commented out — presumably because the
# base YAML passed via --base_yaml_path is already written for that subject;
# confirm against the base config.
SUBJECTS = [
    # "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]
def parse_args():
    """Parse the command-line options for the per-subject YAML generator.

    Returns an argparse.Namespace with:
      base_yaml_path: path to the base YAML to `include:` from (required).
      task_save_path: prefix for the generated per-subject YAML files.
    """
    arg_parser = argparse.ArgumentParser()
    # NOTE: a --benchmark_name option was sketched here but is unused.
    arg_parser.add_argument("--base_yaml_path", required=True)
    arg_parser.add_argument(
        "--task_save_path",
        default="lm_eval/tasks/mmlu/hendrycks_test_original",
    )
    return arg_parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
print(base_yaml)
for subject in tqdm(SUBJECTS):
yaml_dict = {
"include": base_yaml_name,
"task": base_yaml["task"].strip("abstract_algebra") + "subject",
"dataset_name": subject,
"description": f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n",
}
file_save_path = args.task_save_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(yaml_dict, yaml_file)
# Base MMLU task configuration (abstract_algebra subject).
# NOTE(review): the generator script in this commit derives per-subject YAMLs
# from a base file like this one, overriding `task`, `dataset_name`, and
# `description` — confirm this file is the one passed as --base_yaml_path.
group:
- mmlu
- mmlu_original
- multiple_choice
task: mmlu_original_abstract_algebra
# Hugging Face dataset `cais/mmlu`, subset `abstract_algebra`.
dataset_path: cais/mmlu
dataset_name: abstract_algebra
output_type: multiple_choice
validation_split: validation
test_split: test
# Prompt prefix prepended before each question.
description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
# Question plus lettered choices; model answers with a single letter.
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{answer}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment