add arabic mmlu (#1402)

* add arabic mmlu * update the description * add readme file

add arabic mmlu (#1402)
* add arabic mmlu * update the description * add readme file
7de7b27e · khalil · GitHub · c1145dfd · 7de7b27e · 7de7b27e
Unverified Commit 7de7b27e authored Feb 26, 2024 by khalil Committed by GitHub Feb 26, 2024
20 changed files
--- a/lm_eval/tasks/ammlu/README.md
+++ b/lm_eval/tasks/ammlu/README.md
+# ArabicMMLU
+### Paper
+ArabicMMLU: Measuring massive multitask language understanding in Arabic
+This dataset has been translated from the original MMLU with the help of GPT-4.
+The original data comes from [MMLU](https://arxiv.org/pdf/2009.03300v3.pdf).
+The translation was done in collaboration with the AceGPT researchers: [AceGPT](https://arxiv.org/abs/2309.12053).
+ArabicMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Arabic language and culture.
+ArabicMMLU covers a wide range of subjects, comprising 57 topics that span from elementary to advanced professional levels.
+Homepage: [AceGPT Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)
+### Citation
+### Groups and Tasks
+#### Groups
+- `ammlu`: All 57 subjects of the ArabicMMLU dataset, evaluated following the methodology in MMLU's original implementation.
+#### Tasks
+The following tasks evaluate subjects in the ArabicMMLU dataset using loglikelihood-based multiple-choice scoring:
+- `ammlu_{subject_english}`
+### Checklist
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation?
+    * [x] Yes, original implementation contributed by author of the benchmark
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/ammlu/_default_template_yaml
+++ b/lm_eval/tasks/ammlu/_default_template_yaml
+group: ammlu
+dataset_path: Hennara/ammlu
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+output_type: multiple_choice
+doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب："
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/ammlu/_generate_configs.py
+++ b/lm_eval/tasks/ammlu/_generate_configs.py
+"""
+Take in a base YAML and write one task YAML per subject, each of which includes the base YAML.
+"""
+import os
+import yaml
+import argparse
+from tqdm import tqdm
+# Maps each English subject name to the Arabic label of its broad category.
+# The key is used as the `dataset_name` and as part of the task name; the
+# value is interpolated into the default task description when no
+# --cot_prompt_path is given. The four labels translate roughly to:
+# science / information technology / mathematics, other sciences,
+# social sciences, and humanities.
+SUBJECTS = {
+    "abstract_algebra": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "anatomy": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "astronomy": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "business_ethics": "علوم أخرى",
+    "clinical_knowledge": "علوم أخرى",
+    "college_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "college_medicine": "علوم أخرى",
+    "college_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "computer_security": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "conceptual_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "econometrics": "العلوم الإجتماعية",
+    "electrical_engineering": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "elementary_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "formal_logic": "العلوم الانسانية",
+    "global_facts": "علوم أخرى",
+    "high_school_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_european_history": "العلوم الانسانية",
+    "high_school_geography": "العلوم الإجتماعية",
+    "high_school_government_and_politics": "العلوم الإجتماعية",
+    "high_school_macroeconomics": "العلوم الإجتماعية",
+    "high_school_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_microeconomics": "العلوم الإجتماعية",
+    "high_school_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_psychology": "العلوم الإجتماعية",
+    "high_school_statistics": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "high_school_us_history": "العلوم الانسانية",
+    "high_school_world_history": "العلوم الانسانية",
+    "human_aging": "علوم أخرى",
+    "human_sexuality": "العلوم الإجتماعية",
+    "international_law": "العلوم الانسانية",
+    "jurisprudence": "العلوم الانسانية",
+    "logical_fallacies": "العلوم الانسانية",
+    "machine_learning": "ألعلوم وتقنية المعلومات و الرياضيات",
+    "management": "علوم أخرى",
+    "marketing": "علوم أخرى",
+    "medical_genetics": "علوم أخرى",
+    "miscellaneous": "علوم أخرى",
+    "moral_disputes": "العلوم الانسانية",
+    "moral_scenarios": "العلوم الانسانية",
+    "nutrition": "علوم أخرى",
+    "philosophy": "العلوم الانسانية",
+    "prehistory": "العلوم الانسانية",
+    "professional_accounting": "علوم أخرى",
+    "professional_law": "العلوم الانسانية",
+    "professional_medicine": "علوم أخرى",
+    "professional_psychology": "العلوم الإجتماعية",
+    "public_relations": "العلوم الإجتماعية",
+    "security_studies": "العلوم الإجتماعية",
+    "sociology": "العلوم الإجتماعية",
+    "us_foreign_policy": "العلوم الإجتماعية",
+    "virology": "علوم أخرى",
+    "world_religions": "العلوم الانسانية",
+}
+def parse_args():
+    """Parse the command-line options of the config generator.
+
+    Options:
+        --base_yaml_path: required; path of the base YAML whose file name is
+            written into each generated file's "include" entry.
+        --save_prefix_path: prefix of every generated file name
+            (default "ammlu"); "_<subject>.yaml" is appended to it.
+        --cot_prompt_path: optional path to a JSON file that is indexed by
+            subject name to obtain each task description.
+        --task_prefix: optional string inserted into the generated task
+            names as "ammlu_<task_prefix>_<subject>".
+
+    Returns:
+        The namespace returned by ``parser.parse_args()``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_yaml_path", required=True)
+    parser.add_argument("--save_prefix_path", default="ammlu")
+    parser.add_argument("--cot_prompt_path", default=None)
+    parser.add_argument("--task_prefix", default="")
+    return parser.parse_args()
+# Script entry point: write one "<save_prefix_path>_<subject>.yaml" file per
+# entry in SUBJECTS, each including the base YAML and setting the task name,
+# dataset name and description for that subject.
+if __name__ == "__main__":
+    args = parse_args()
+    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
+    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
+    # NOTE(review): base_yaml is loaded here but not referenced again below;
+    # the load still fails early if the path is missing or is not valid YAML.
+    with open(args.base_yaml_path, encoding="utf-8") as f:
+        base_yaml = yaml.full_load(f)
+    if args.cot_prompt_path is not None:
+        import json
+        # The JSON is indexed by subject name below to get each description.
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
+            cot_file = json.load(f)
+    for subject_eng, category in tqdm(SUBJECTS.items()):
+        if args.cot_prompt_path is not None:
+            description = cot_file[subject_eng]
+        else:
+            # NOTE(review): the text starts with "فم", which looks like a typo
+            # for "قم" ("carry out"). It is part of the prompt and is also
+            # present in the already generated YAML files, so confirm before
+            # changing it and regenerate all files together.
+            description = (
+                f"فم بعملية التقييم في مجال {category} \n\n"
+            )
+        # Task name is "ammlu_<subject>", or "ammlu_<task_prefix>_<subject>"
+        # when a non-empty --task_prefix was given.
+        yaml_dict = {
+            "include": base_yaml_name,
+            "task": f"ammlu_{args.task_prefix}_{subject_eng}"
+            if args.task_prefix != ""
+            else f"ammlu_{subject_eng}",
+            "dataset_name": subject_eng,
+            "description": description,
+        }
+        # Plain string concatenation: the prefix is used as given, so it may
+        # carry a directory component.
+        file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
+        print(f"Saving yaml for subset {subject_eng} to {file_save_path}")
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
+            # width=inf keeps long descriptions on a single line,
+            # allow_unicode writes the Arabic text as-is instead of escaping
+            # it, and default_style='"' double-quotes every key and value.
+            yaml.dump(
+                yaml_dict,
+                yaml_file,
+                width=float("inf"),
+                allow_unicode=True,
+                default_style='"',
+            )
--- a/lm_eval/tasks/ammlu/ammlu_abstract_algebra.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_abstract_algebra.yaml
+"dataset_name": "abstract_algebra"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_abstract_algebra"
--- a/lm_eval/tasks/ammlu/ammlu_anatomy.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_anatomy.yaml
+"dataset_name": "anatomy"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_anatomy"
--- a/lm_eval/tasks/ammlu/ammlu_astronomy.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_astronomy.yaml
+"dataset_name": "astronomy"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_astronomy"
--- a/lm_eval/tasks/ammlu/ammlu_business_ethics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_business_ethics.yaml
+"dataset_name": "business_ethics"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_business_ethics"
--- a/lm_eval/tasks/ammlu/ammlu_clinical_knowledge.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_clinical_knowledge.yaml
+"dataset_name": "clinical_knowledge"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_clinical_knowledge"
--- a/lm_eval/tasks/ammlu/ammlu_college_biology.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_biology.yaml
+"dataset_name": "college_biology"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_college_biology"
--- a/lm_eval/tasks/ammlu/ammlu_college_chemistry.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_chemistry.yaml
+"dataset_name": "college_chemistry"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_college_chemistry"
--- a/lm_eval/tasks/ammlu/ammlu_college_computer_science.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_computer_science.yaml
+"dataset_name": "college_computer_science"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_college_computer_science"
--- a/lm_eval/tasks/ammlu/ammlu_college_mathematics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_mathematics.yaml
+"dataset_name": "college_mathematics"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_college_mathematics"
--- a/lm_eval/tasks/ammlu/ammlu_college_medicine.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_medicine.yaml
+"dataset_name": "college_medicine"
+"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_college_medicine"
--- a/lm_eval/tasks/ammlu/ammlu_college_physics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_college_physics.yaml
+"dataset_name": "college_physics"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_college_physics"
--- a/lm_eval/tasks/ammlu/ammlu_computer_security.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_computer_security.yaml
+"dataset_name": "computer_security"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_computer_security"
--- a/lm_eval/tasks/ammlu/ammlu_conceptual_physics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_conceptual_physics.yaml
+"dataset_name": "conceptual_physics"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_conceptual_physics"
--- a/lm_eval/tasks/ammlu/ammlu_econometrics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_econometrics.yaml
+"dataset_name": "econometrics"
+"description": "فم بعملية التقييم في مجال العلوم الإجتماعية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_econometrics"
--- a/lm_eval/tasks/ammlu/ammlu_electrical_engineering.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_electrical_engineering.yaml
+"dataset_name": "electrical_engineering"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_electrical_engineering"
--- a/lm_eval/tasks/ammlu/ammlu_elementary_mathematics.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_elementary_mathematics.yaml
+"dataset_name": "elementary_mathematics"
+"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_elementary_mathematics"
--- a/lm_eval/tasks/ammlu/ammlu_formal_logic.yaml
+++ b/lm_eval/tasks/ammlu/ammlu_formal_logic.yaml
+"dataset_name": "formal_logic"
+"description": "فم بعملية التقييم في مجال العلوم الانسانية \n\n"
+"include": "_default_template_yaml"
+"task": "ammlu_formal_logic"