Unverified Commit d5f39bf8 authored by SuperCat's avatar SuperCat Committed by GitHub
Browse files

Add new dataset MMLU-SR tasks (#2032)



* add mmlusr tasks

* renamed all task names in mmlusr

* edit format and readme

* added mmlu_sr

* mmlu_sr -> mmlusr

* update

---------
Co-authored-by: lintangsutawika <lintang@eleuther.ai>
parent cdd954f9
"dataset_name": "answer_only_world_religions"
"description": "The following are multiple choice questions (with answers) about world\
\ religions.\n\n"
"tag": "mmlusr_answer_only_humanities_tasks"
"include": "_mmlusr_a_yml"
"task": "mmlusr_answer_only_world_religions"
"task_alias": "world religions"
import datasets
def process_docs(dataset: "datasets.Dataset") -> "datasets.Dataset":
    """Convert raw MMLU-SR rows into the shape the task YAML expects.

    Each source row provides ``question``, ``choice1``..``choice4`` and a
    numeric ``answer`` index (0-3, possibly as a string).  The mapped row
    gains:

    * ``choices`` -- the four options as a list, in A/B/C/D order;
    * ``answer`` -- rewritten from the numeric index to the matching letter
      so ``doc_to_target`` lines up with ``doc_to_choice: ["A","B","C","D"]``.
    """
    letters = ("A", "B", "C", "D")

    def _helper(doc):
        # The answer column holds 0-3; int() also tolerates string digits.
        answer_index = int(doc["answer"])
        return {
            # Fix: this was previously written under the misspelled key
            # "questions"; the task template reads "question", which only
            # worked because Dataset.map preserves the source columns.
            "question": doc["question"],
            "choices": [doc["choice1"], doc["choice2"], doc["choice3"], doc["choice4"]],
            "answer": letters[answer_index],
        }

    return dataset.map(_helper)
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
# Module-level logger for progress/info messages emitted by the generator.
eval_logger = logging.getLogger("lm-eval")

# Every MMLU subject mapped to its top-level category.  The category feeds
# the generated "tag" field (e.g. mmlusr_question_and_answer_stem) so tasks
# can be grouped by STEM / humanities / social sciences / other.
SUBJECTS = {
    "abstract_algebra": "stem",
    "anatomy": "stem",
    "astronomy": "stem",
    "business_ethics": "other",
    "clinical_knowledge": "other",
    "college_biology": "stem",
    "college_chemistry": "stem",
    "college_computer_science": "stem",
    "college_mathematics": "stem",
    "college_medicine": "other",
    "college_physics": "stem",
    "computer_security": "stem",
    "conceptual_physics": "stem",
    "econometrics": "social_sciences",
    "electrical_engineering": "stem",
    "elementary_mathematics": "stem",
    "formal_logic": "humanities",
    "global_facts": "other",
    "high_school_biology": "stem",
    "high_school_chemistry": "stem",
    "high_school_computer_science": "stem",
    "high_school_european_history": "humanities",
    "high_school_geography": "social_sciences",
    "high_school_government_and_politics": "social_sciences",
    "high_school_macroeconomics": "social_sciences",
    "high_school_mathematics": "stem",
    "high_school_microeconomics": "social_sciences",
    "high_school_physics": "stem",
    "high_school_psychology": "social_sciences",
    "high_school_statistics": "stem",
    "high_school_us_history": "humanities",
    "high_school_world_history": "humanities",
    "human_aging": "other",
    "human_sexuality": "social_sciences",
    "international_law": "humanities",
    "jurisprudence": "humanities",
    "logical_fallacies": "humanities",
    "machine_learning": "stem",
    "management": "other",
    "marketing": "other",
    "medical_genetics": "other",
    "miscellaneous": "other",
    "moral_disputes": "humanities",
    "moral_scenarios": "humanities",
    "nutrition": "other",
    "philosophy": "humanities",
    "prehistory": "humanities",
    "professional_accounting": "other",
    "professional_law": "humanities",
    "professional_medicine": "other",
    "professional_psychology": "social_sciences",
    "public_relations": "social_sciences",
    "security_studies": "social_sciences",
    "sociology": "social_sciences",
    "us_foreign_policy": "social_sciences",
    "virology": "other",
    "world_religions": "humanities",
}

# MMLU-SR variants to emit configs for; this generator instance handles the
# "question and answer" substitution split.
GROUPS = ["question_and_answer"]
def parse_args():
    """Build and parse the command-line options for the config generator."""
    p = argparse.ArgumentParser(
        description="Generate configuration YAML files for LM Evaluation Harness."
    )
    # Base YAML every generated per-subject config will `include`.
    p.add_argument(
        "--base_yaml_path",
        required=True,
        help="Path to the base YAML configuration file.",
    )
    # NOTE(review): machine-specific absolute default -- confirm whether this
    # should default to the current directory instead.
    p.add_argument(
        "--save_dir",
        default="/data/local/cat/lm-evaluation-harness/lm_eval/tasks/mmlusr/question_and_answer",
    )
    # Optional prefixes prepended to generated task / group names.
    p.add_argument("--task_prefix", default="")
    p.add_argument("--cot_prompt_path", default=None)
    p.add_argument("--group_prefix", default="")
    return p.parse_args()
if __name__ == "__main__":
    args = parse_args()

    # Only the file NAME of the base YAML is recorded: generated configs
    # reference it via a relative `include`, so they must live beside it.
    base_yaml_name = os.path.basename(args.base_yaml_path)
    with open(args.base_yaml_path, "r", encoding="utf-8") as f:
        base_yaml = yaml.full_load(f)

    # Optional chain-of-thought prompts: a JSON mapping subject -> description
    # that overrides the default auto-generated description.
    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path, encoding="utf-8") as f:
            cot_file = json.load(f)

    # Robustness: make sure the output directory exists before writing.
    os.makedirs(args.save_dir, exist_ok=True)

    for group in GROUPS:
        for subject, category in tqdm(SUBJECTS.items()):
            if args.cot_prompt_path is not None:
                description = cot_file[subject]
            else:
                description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"

            yaml_dict = {
                "include": base_yaml_name,
                "tag": f"mmlusr_{args.group_prefix}{group}_{category}"
                if args.group_prefix
                else f"mmlusr_{group}_{category}",
                "task": f"mmlusr_{args.task_prefix}{group}_{subject}"
                if args.task_prefix
                else f"mmlusr_{group}_{subject}",
                "task_alias": subject.replace("_", " "),
                "description": description,
                "dataset_name": f"{group}_{subject}",
            }

            # One YAML per (group, subject).  default_style='"' double-quotes
            # every scalar so descriptions with embedded newlines round-trip.
            file_save_path = os.path.join(args.save_dir, f"{group}_{subject}.yaml")
            with open(file_save_path, "w", encoding="utf-8") as yaml_file:
                yaml.dump(yaml_dict, yaml_file, allow_unicode=True, default_style='"')
            eval_logger.info(f"Saved YAML for {group} {subject} to {file_save_path}")

    # Save group configuration if a group prefix was specified.
    if args.group_prefix:
        # Fix: this previously read `args.save_prefix_path`, an attribute
        # parse_args() never defines (AttributeError at runtime); the output
        # directory option is `args.save_dir`.
        file_save_path = os.path.join(args.save_dir, args.group_prefix + ".yaml")
        eval_logger.info(f"Saving benchmark config to {file_save_path}")
        # NOTE(review): this dumps the last per-task `yaml_dict` from the loop
        # above, which looks like a leftover -- confirm whether a dedicated
        # group config was intended here.
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(yaml_dict, yaml_file, indent=4, default_flow_style=False)
# Shared base config for the MMLU-SR "question and answer" tasks; the
# generated per-subject YAMLs `include` this file and only override
# task/tag/description/dataset_name.
dataset_path: NiniCat/MMLU-SR
test_split: test
fewshot_split: train
fewshot_config:
  sampler: first_n
output_type: multiple_choice
# process_docs (utils.py) adds `choices` and rewrites the numeric answer
# index into the matching letter.
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
# Top-level MMLU-SR (question & answer) group: rolls up the four category
# subgroups, each of which aggregates its subject tasks by accuracy,
# weighted by subset size.
group: mmlusr
group_alias: MMLU-SR (Question & Answer)
task:
  - group: mmlusr_qa_stem
    group_alias: STEM (Question & Answer)
    task:
      - mmlusr_question_and_answer_stem_tasks
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    metadata:
      version: 1
  - group: mmlusr_qa_other
    group_alias: Other (Question & Answer)
    task:
      - mmlusr_question_and_answer_other_tasks
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    metadata:
      version: 1
  - group: mmlusr_qa_social_sciences
    group_alias: Social Sciences (Question & Answer)
    task:
      - mmlusr_question_and_answer_social_sciences_tasks
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    metadata:
      version: 1
  - group: mmlusr_qa_humanities
    group_alias: Humanities (Question & Answer)
    task:
      - mmlusr_question_and_answer_humanities_tasks
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    metadata:
      version: 1
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 1
"dataset_name": "question_and_answer_abstract_algebra"
"description": "The following are multiple choice questions (with answers) about abstract\
\ algebra.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_abstract_algebra"
"task_alias": "abstract algebra"
"dataset_name": "question_and_answer_anatomy"
"description": "The following are multiple choice questions (with answers) about anatomy.\n\
\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_anatomy"
"task_alias": "anatomy"
"dataset_name": "question_and_answer_astronomy"
"description": "The following are multiple choice questions (with answers) about astronomy.\n\
\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_astronomy"
"task_alias": "astronomy"
"dataset_name": "question_and_answer_business_ethics"
"description": "The following are multiple choice questions (with answers) about business\
\ ethics.\n\n"
"tag": "mmlusr_question_and_answer_other_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_business_ethics"
"task_alias": "business ethics"
"dataset_name": "question_and_answer_clinical_knowledge"
"description": "The following are multiple choice questions (with answers) about clinical\
\ knowledge.\n\n"
"tag": "mmlusr_question_and_answer_other_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_clinical_knowledge"
"task_alias": "clinical knowledge"
"dataset_name": "question_and_answer_college_biology"
"description": "The following are multiple choice questions (with answers) about college\
\ biology.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_college_biology"
"task_alias": "college biology"
"dataset_name": "question_and_answer_college_chemistry"
"description": "The following are multiple choice questions (with answers) about college\
\ chemistry.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_college_chemistry"
"task_alias": "college chemistry"
"dataset_name": "question_and_answer_college_computer_science"
"description": "The following are multiple choice questions (with answers) about college\
\ computer science.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_college_computer_science"
"task_alias": "college computer science"
"dataset_name": "question_and_answer_college_mathematics"
"description": "The following are multiple choice questions (with answers) about college\
\ mathematics.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_college_mathematics"
"task_alias": "college mathematics"
"dataset_name": "question_and_answer_college_medicine"
"description": "The following are multiple choice questions (with answers) about college\
\ medicine.\n\n"
"tag": "mmlusr_question_and_answer_other_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_college_medicine"
"task_alias": "college medicine"
"dataset_name": "question_and_answer_college_physics"
"description": "The following are multiple choice questions (with answers) about college\
\ physics.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_college_physics"
"task_alias": "college physics"
"dataset_name": "question_and_answer_computer_security"
"description": "The following are multiple choice questions (with answers) about computer\
\ security.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_computer_security"
"task_alias": "computer security"
"dataset_name": "question_and_answer_conceptual_physics"
"description": "The following are multiple choice questions (with answers) about conceptual\
\ physics.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_conceptual_physics"
"task_alias": "conceptual physics"
"dataset_name": "question_and_answer_econometrics"
"description": "The following are multiple choice questions (with answers) about econometrics.\n\
\n"
"tag": "mmlusr_question_and_answer_social_sciences_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_econometrics"
"task_alias": "econometrics"
"dataset_name": "question_and_answer_electrical_engineering"
"description": "The following are multiple choice questions (with answers) about electrical\
\ engineering.\n\n"
"tag": "mmlusr_question_and_answer_stem_tasks"
"include": "_mmlusr_qna_yml"
"task": "mmlusr_question_and_answer_electrical_engineering"
"task_alias": "electrical engineering"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment