Commit 191458b8 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'mmlu-refactorbranch' of...

Merge branch 'mmlu-refactorbranch' of https://github.com/EleutherAI/lm-evaluation-harness into flan-benchmark
parents a81ef1a7 9b00813f
......@@ -40,6 +40,6 @@ repos:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt
.*\.json|ignore.txt|.*\.yaml
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
"""
Take in a YAML, and output all other splits with this YAML
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
# All MMLU subject (dataset subset) names for which a per-subject YAML is
# generated by the __main__ block below.
# NOTE(review): "abstract_algebra" is commented out — presumably because the
# base YAML passed via --base_yaml_path is already written for that subject;
# confirm against the base config.
SUBJECTS = [
    # "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]
def parse_args():
    """Parse the command-line options for the per-subject YAML generator.

    Returns an argparse.Namespace with:
      base_yaml_path: path to the base YAML to `include:` from (required).
      task_save_path: prefix for the generated per-subject YAML files.
    """
    arg_parser = argparse.ArgumentParser()
    # NOTE: a --benchmark_name option was sketched here but is unused.
    arg_parser.add_argument("--base_yaml_path", required=True)
    arg_parser.add_argument(
        "--task_save_path",
        default="lm_eval/tasks/mmlu/hendrycks_test_original",
    )
    return arg_parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
print(base_yaml)
for subject in tqdm(SUBJECTS):
yaml_dict = {
"include": base_yaml_name,
"task": base_yaml["task"].strip("abstract_algebra") + "subject",
"dataset_name": subject,
"description": f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n",
}
file_save_path = args.task_save_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(yaml_dict, yaml_file)
# Base MMLU task configuration (abstract_algebra subject).
# NOTE(review): the generator script in this commit derives per-subject YAMLs
# from a base file like this one, overriding `task`, `dataset_name`, and
# `description` — confirm this file is the one passed as --base_yaml_path.
group:
- mmlu
- mmlu_original
- multiple_choice
task: mmlu_original_abstract_algebra
# Hugging Face dataset `cais/mmlu`, subset `abstract_algebra`.
dataset_path: cais/mmlu
dataset_name: abstract_algebra
output_type: multiple_choice
validation_split: validation
test_split: test
# Prompt prefix prepended before each question.
description: "The following are multiple choice questions (with answers) about abstract algebra.\n\n"
# Question plus lettered choices; model answers with a single letter.
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{answer}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment