Merge branch 'main' into longcxt

2b56339e · Baber · 0b533339 · 703fbffd · 2b56339e · 2b56339e
Commit 2b56339e authored Jan 17, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_social-science_political-science.yaml
+"dataset_name": "univ_social-science_political-science"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_egy"
+"task": "AraDiCE_ArabicMMLU_univ_social-science_political-science_egy"
+"task_alias": "univ social-science political-science"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/AraDiCE_ArabicMMLU_univ_stem_computer-science.yaml
+"dataset_name": "univ_stem_computer-science"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_egy"
+"task": "AraDiCE_ArabicMMLU_univ_stem_computer-science_egy"
+"task_alias": "univ stem computer-science"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/_default_template_yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/_default_template_yaml
+# Shared task settings pulled in by the per-subject task files through
+# their `include: _default_template_yaml` entry.
+dataset_path: "QCRI/AraDICE-ArabicMMLU-egy"
+fewshot_config:
+  sampler: default
+output_type: multiple_choice
+# utils.process_docs adds the `prompt`, `choices` and `target` fields that
+# the three doc_to_* entries below refer to.
+process_docs: !function utils.process_docs
+doc_to_text: "{{prompt}}"
+doc_to_choice: choices
+doc_to_target: target
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+  - metric: f1
+    higher_is_better: true
+    # Aggregated by metrics.micro_f1_score, which calls sklearn's f1_score
+    # with average="micro".
+    aggregation: !function metrics.micro_f1_score
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/metrics.py
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/metrics.py
+from sklearn.metrics import f1_score
+
+
+def macro_f1_score(items):
+    unzipped_list = list(zip(*items))
+    golds = unzipped_list[0]
+    preds = unzipped_list[1]
+    fscore = f1_score(golds, preds, average="macro")
+    return fscore
+
+
+def micro_f1_score(items):
+    unzipped_list = list(zip(*items))
+    golds = unzipped_list[0]
+    preds = unzipped_list[1]
+    fscore = f1_score(golds, preds, average="micro")
+    return fscore
+
+
+def weighted_f1_score(items):
+    unzipped_list = list(zip(*items))
+    golds = unzipped_list[0]
+    preds = unzipped_list[1]
+    fscore = f1_score(golds, preds, average="weighted")
+    return fscore
--- a/lm_eval/tasks/aradice/ArabicMMLU/EGY/utils.py
+++ b/lm_eval/tasks/aradice/ArabicMMLU/EGY/utils.py
+# Arabic phrase for each education level, keyed by the document's "Level"
+# value (looked up in process_docs when building the prompt).
+# NOTE(review): the "Univ" entry ends with a trailing space, which is carried
+# into the prompt text -- confirm whether that is intentional.
+level_ar = {
+    "Primary": "للمرحلة الابتدائية",
+    "Middle": "للمرحلة المتوسطة",
+    "High": "للمرحلة الثانوية",
+    "Univ": "للمرحلة الجامعية ",
+    "Prof": "للمحترفين",
+}
+
+# Arabic phrase for each country, keyed by the document's "Country" value.
+country_ar = {
+    "UAE": "في الإمارات",
+    "Egypt": "في مصر",
+    "Lebanon": "في لبنان",
+    "Jordan": "في الأردن",
+    "Kuwait": "في الكويت",
+    "KSA": "في السعودية",
+    "Palestine": "في فلسطين",
+    "Morocco": "في المغرب",
+}
+
+# Arabic phrase for each subject, keyed by the document's "Subject" value.
+# process_docs indexes this table directly, so a subject missing from it
+# raises KeyError.
+# NOTE(review): the "Political Science" entry starts with a leading space,
+# unlike the other entries -- confirm whether that is intentional.
+subject_ar = {
+    "Islamic Studies": "في الدراسات إسلامية",
+    "Driving Test": "في اختبار القيادة",
+    "Natural Science": "في العلوم الطبيعية",
+    "History": "في مادة التاريخ",
+    "General Knowledge": "في المعرفة العامة",
+    "Law": "في القانون",
+    "Physics": "في الفيزياء",
+    "Social Science": "في العلوم الاجتماعية",
+    "Management": "في الإدارة",
+    "Arabic Language": "في اللغة العربية",
+    "Political Science": " في العلوم السياسية",
+    "Philosophy": "في الفلسفة",
+    "Accounting": "في المحاسبة",
+    "Computer Science": "في علوم الحاسوب",
+    "Geography": "في الجغرافيا",
+    "Math": "في الرياضيات",
+    "Biology": "في علم الأحياء",
+    "Economics": "في الاقتصاد",
+    "Arabic Language (General)": "في اللغة العربية (عام)",
+    "Arabic Language (Grammar)": "في اللغة العربية (النحو)",
+    "Civics": "في التربية المدنية",
+}
+
+
+# Option labels. process_docs prefixes each option line with the matching
+# alpa_ar entry and slices all_choices to the number of options present.
+# alpa_en and all_choices_en are not referenced by process_docs in this file.
+# NOTE(review): the fifth Arabic label is the letter waw; the usual abjad
+# sequence after the first four letters continues with ha -- confirm this is
+# intended before changing it, since it alters the generated prompts.
+alpa_ar = ["أ-", "ب-", "ج-", "د-", "و-"]
+alpa_en = ["A-", "B-", "C-", "D-", "E-"]
+all_choices = ["أ", "ب", "ج", "د", "و"]
+all_choices_en = ["A", "B", "C", "D", "E"]
+
+
+def process_docs(dataset):
+    def _helper(doc):
+        # modifies the contents of a single
+        # document in our dataset.
+
+        PROMPT = "ده سؤال [MAIN_META_DATA]. اختار الإجابة الصحيحة!\n\nسؤال: [INPUT]\n[OPTION]"
+        PROMPT = f"{PROMPT}\n\nإجابة:"
+        alpa = alpa_ar
+        subject = subject_ar[doc["Subject"]]
+        level = " " + level_ar[doc["Level"]] if doc["Level"] else ""
+        country = " " + country_ar[doc["Country"]] if doc["Country"] else ""
+        main_meta_data = f"{subject}{level}{country}"
+
+        question = (
+            f"{doc['context']}\n\n{doc['question']}"
+            if doc["context"]
+            else doc["question"]
+        )
+        options = []
+        for i, opt in enumerate(["A", "B", "C", "D", "E"]):
+            if opt not in doc["options"] or doc["options"][opt] is None:
+                break
+            options.append(f"{alpa[i]} {doc['options'][opt]}")
+
+        doc["prompt"] = (
+            PROMPT.replace("[MAIN_META_DATA]", main_meta_data)
+            .replace("[INPUT]", question)
+            .replace("[OPTION]", "\n".join(options))
+        )
+
+        doc["choices"] = all_choices[: len(options)]
+
+        doc["target"] = ["A", "B", "C", "D", "E"].index(doc["Answer Key"])
+
+        return doc
+
+    return dataset.map(_helper)  # returns back a datasets.Dataset object
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU.yaml
+# Group definition for the LEV variant: lists the five category tags that
+# the per-subject task files assign through their `tag` entry.
+group: AraDiCE_ArabicMMLU_lev
+task:
+- AraDiCE_ArabicMMLU_humanities_lev
+- AraDiCE_ArabicMMLU_language_lev
+- AraDiCE_ArabicMMLU_social-science_lev
+- AraDiCE_ArabicMMLU_stem_lev
+- AraDiCE_ArabicMMLU_other_lev
+# NOTE(review): weight_by_size presumably weights each member by its number
+# of documents when aggregating -- confirm against the harness documentation.
+aggregate_metric_list:
+  - metric: acc
+    weight_by_size: True
+  - metric: acc_norm
+    weight_by_size: True
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_history.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_history.yaml
+"dataset_name": "high_humanities_history"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_lev"
+"task": "AraDiCE_ArabicMMLU_high_humanities_history_lev"
+"task_alias": "high humanities history"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_islamic-studies.yaml
+"dataset_name": "high_humanities_islamic-studies"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_lev"
+"task": "AraDiCE_ArabicMMLU_high_humanities_islamic-studies_lev"
+"task_alias": "high humanities islamic-studies"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_humanities_philosophy.yaml
+"dataset_name": "high_humanities_philosophy"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_lev"
+"task": "AraDiCE_ArabicMMLU_high_humanities_philosophy_lev"
+"task_alias": "high humanities philosophy"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_language_arabic-language.yaml
+"dataset_name": "high_language_arabic-language"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_language_lev"
+"task": "AraDiCE_ArabicMMLU_high_language_arabic-language_lev"
+"task_alias": "high language arabic-language"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_civics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_civics.yaml
+"dataset_name": "high_social-science_civics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_lev"
+"task": "AraDiCE_ArabicMMLU_high_social-science_civics_lev"
+"task_alias": "high social-science civics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_economics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_economics.yaml
+"dataset_name": "high_social-science_economics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_lev"
+"task": "AraDiCE_ArabicMMLU_high_social-science_economics_lev"
+"task_alias": "high social-science economics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_geography.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_social-science_geography.yaml
+"dataset_name": "high_social-science_geography"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_social-science_lev"
+"task": "AraDiCE_ArabicMMLU_high_social-science_geography_lev"
+"task_alias": "high social-science geography"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_biology.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_biology.yaml
+"dataset_name": "high_stem_biology"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_lev"
+"task": "AraDiCE_ArabicMMLU_high_stem_biology_lev"
+"task_alias": "high stem biology"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_computer-science.yaml
+"dataset_name": "high_stem_computer-science"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_lev"
+"task": "AraDiCE_ArabicMMLU_high_stem_computer-science_lev"
+"task_alias": "high stem computer-science"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_physics.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_high_stem_physics.yaml
+"dataset_name": "high_stem_physics"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_stem_lev"
+"task": "AraDiCE_ArabicMMLU_high_stem_physics_lev"
+"task_alias": "high stem physics"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_history.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_history.yaml
+"dataset_name": "middle_humanities_history"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_lev"
+"task": "AraDiCE_ArabicMMLU_middle_humanities_history_lev"
+"task_alias": "middle humanities history"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_humanities_islamic-studies.yaml
+"dataset_name": "middle_humanities_islamic-studies"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_humanities_lev"
+"task": "AraDiCE_ArabicMMLU_middle_humanities_islamic-studies_lev"
+"task_alias": "middle humanities islamic-studies"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_language_arabic-language.yaml
+"dataset_name": "middle_language_arabic-language"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_language_lev"
+"task": "AraDiCE_ArabicMMLU_middle_language_arabic-language_lev"
+"task_alias": "middle language arabic-language"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"
--- a/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml
+++ b/lm_eval/tasks/aradice/ArabicMMLU/LEV/AraDiCE_ArabicMMLU_middle_other_general-knowledge.yaml
+"dataset_name": "middle_other_general-knowledge"
+"description": ""
+"fewshot_split": !!null "null"
+"include": "_default_template_yaml"
+"tag": "AraDiCE_ArabicMMLU_other_lev"
+"task": "AraDiCE_ArabicMMLU_middle_other_general-knowledge_lev"
+"task_alias": "middle other general-knowledge"
+"test_split": "test"
+"training_split": !!null "null"
+"validation_split": !!null "null"