Unverified Commit 932e8f9e authored by Firoj Alam (Scientist, QCRI), committed by GitHub

AraDICE task config file (#2507)



* added aradice

* Added ArabicMMLU Lev Configs

* added ArabicMMLU egy configs

* Added boolq configs

* Added cultural bench configs

* added openbookqa configs

* Added PiQA configs

* added winogrande configs

* Added truthfulQA configs

* Added aradice group config

* Remove deleted files from repository

* modified arabicmmlu configs

* modified metadata versions

* fixed formatting using ruff

* added aradice tasks information

* pre-commit

* Updated openbookqa utils

* fixed formatting on obqa

---------
Co-authored-by: Basel Mousi <bmousi@hbku.edu.qa>
Co-authored-by: Baber <baber@hey.com>
parent b86aa213
"dataset_name": "prof_humanities_law"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_humanities_lev"
"task": "AraDiCE_ArabicMMLU_prof_humanities_law_lev"
"task_alias": "prof humanities law"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_other_management"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_other_lev"
"task": "AraDiCE_ArabicMMLU_univ_other_management_lev"
"task_alias": "univ other management"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_accounting"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_lev"
"task": "AraDiCE_ArabicMMLU_univ_social-science_accounting_lev"
"task_alias": "univ social-science accounting"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_economics"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_lev"
"task": "AraDiCE_ArabicMMLU_univ_social-science_economics_lev"
"task_alias": "univ social-science economics"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_political-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_lev"
"task": "AraDiCE_ArabicMMLU_univ_social-science_political-science_lev"
"task_alias": "univ social-science political-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_stem_computer-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_stem_lev"
"task": "AraDiCE_ArabicMMLU_univ_stem_computer-science_lev"
"task_alias": "univ stem computer-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
dataset_path: QCRI/AraDICE-ArabicMMLU-lev
fewshot_config:
sampler: default
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{prompt}}"
doc_to_choice: choices
doc_to_target: target
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 0.0
from sklearn.metrics import f1_score


# Each aggregation below receives `items`, a list of (gold, pred) label pairs
# collected across documents, and returns the corresponding F1 variant.
def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
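These aggregations are wired into the task YAML via `aggregation: !function metrics.micro_f1_score`. A toy check of their behaviour, assuming the harness passes a list of (gold, pred) label pairs:

```python
# Assumed item format: one (gold, pred) pair per evaluated document.
from metrics import macro_f1_score, micro_f1_score

items = [(0, 0), (1, 1), (1, 0), (0, 0)]
print(micro_f1_score(items))  # 0.75 -- micro-averaged F1 equals accuracy here
print(macro_f1_score(items))  # unweighted mean of the per-class F1 scores
```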
# Arabic phrasings for the metadata fields interpolated into the prompt.
level_ar = {
"Primary": "للمرحلة الابتدائية",
"Middle": "للمرحلة المتوسطة",
"High": "للمرحلة الثانوية",
"Univ": "للمرحلة الجامعية ",
"Prof": "للمحترفين",
}
country_ar = {
"UAE": "بالإمارات",
"Egypt": "بمصر",
"Lebanon": "بلبنان",
"Jordan": "بالأردن",
"Kuwait": "بالكويت",
"KSA": "بالسعودية",
"Palestine": "بفلسطين",
"Morocco": "بالمغرب",
}
subject_ar = {
"Islamic Studies": "عن الدراسات إسلامية",
"Driving Test": "عن فحص السواقة",
"Natural Science": "عن العلوم الطبيعية",
"History": "تاريخ",
"General Knowledge": "معرفة عامة",
"Law": "عن القانون",
"Physics": "فيزياء",
"Social Science": "علوم اجتماعية",
"Management": "عن الإدارة",
"Arabic Language": "عن اللغة العربية",
"Political Science": " عن العلوم السياسية",
"Philosophy": "فلسفة",
"Accounting": "محاسبة",
"Computer Science": "عن علوم الحاسوب",
"Geography": "جغرافيا",
"Math": "رياضيات",
"Biology": "بيولوجي",
"Economics": "اقتصاد",
"Arabic Language (General)": "لغة العربية (عام)",
"Arabic Language (Grammar)": "لغة العربية (نحو)",
"Civics": "تربية مدنية",
}
alpa_ar = ["أ-", "ب-", "ج-", "د-", "و-"]
alpa_en = ["A-", "B-", "C-", "D-", "E-"]
all_choices = ["أ", "ب", "ج", "د", "و"]
all_choices_en = ["A", "B", "C", "D", "E"]
def process_docs(dataset):
def _helper(doc):
# modifies the contents of a single
# document in our dataset.
        # Levantine Arabic prompt template; roughly: "This is a [METADATA]
        # question. Pick the correct answer!\n\nQuestion: [INPUT]\n[OPTIONS]\n\nAnswer:"
        PROMPT = (
            "هيدا سؤال [MAIN_META_DATA]. نقي الجواب الصح!\n\nسؤال: [INPUT]\n[OPTION]"
        )
        PROMPT = f"{PROMPT}\n\nالجواب:"
alpa = alpa_ar
subject = subject_ar[doc["Subject"]]
level = " " + level_ar[doc["Level"]] if doc["Level"] else ""
country = " " + country_ar[doc["Country"]] if doc["Country"] else ""
main_meta_data = f"{subject}{level}{country}"
question = (
f"{doc['context']}\n\n{doc['question']}"
if doc["context"]
else doc["question"]
)
options = []
for i, opt in enumerate(["A", "B", "C", "D", "E"]):
if opt not in doc["options"] or doc["options"][opt] is None:
break
options.append(f"{alpa[i]} {doc['options'][opt]}")
doc["prompt"] = (
PROMPT.replace("[MAIN_META_DATA]", main_meta_data)
.replace("[INPUT]", question)
.replace("[OPTION]", "\n".join(options))
)
doc["choices"] = all_choices[: len(options)]
doc["target"] = ["A", "B", "C", "D", "E"].index(doc["Answer Key"])
return doc
return dataset.map(_helper)
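For reference, a minimal sketch of what `process_docs` produces on a single toy record. The field values below are invented for illustration; the field names mirror the ones `_helper` reads from the HF dataset:

```python
# Toy record; values are made up, field names follow what _helper accesses.
from datasets import Dataset

from utils import process_docs  # this utils module

toy = Dataset.from_list([
    {
        "Subject": "Math",
        "Level": "High",
        "Country": "Lebanon",
        "context": "",
        "question": "كم يساوي ٢ + ٢؟",
        "options": {"A": "٣", "B": "٤", "C": "٥", "D": None, "E": None},
        "Answer Key": "B",
    }
])

processed = process_docs(toy)
print(processed[0]["prompt"])   # Levantine prompt with subject/level/country metadata
print(processed[0]["choices"])  # ['أ', 'ب', 'ج'] -- one Arabic letter per populated option
print(processed[0]["target"])   # 1
```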
# AraDiCE
### Paper
**Title:** AraDiCE: Benchmarks for Dialectal and Cultural Capabilities in LLMs
**Abstract:** Arabic, with its rich diversity of dialects, remains significantly underrepresented in Large Language Models, particularly in dialectal variations. We address this gap by introducing seven synthetic datasets in dialects alongside Modern Standard Arabic (MSA), created using Machine Translation (MT) combined with human post-editing. We present AraDiCE, a benchmark for Arabic Dialect and Cultural Evaluation. We evaluate LLMs on dialect comprehension and generation, focusing specifically on low-resource Arabic dialects. Additionally, we introduce the first-ever fine-grained benchmark designed to evaluate cultural awareness across the Gulf, Egypt, and Levant regions, providing a novel dimension to LLM evaluation. Our findings demonstrate that while Arabic-specific models like Jais and AceGPT outperform multilingual models on dialectal tasks, significant challenges persist in dialect identification, generation, and translation. This work contributes ~45K post-edited samples, a cultural benchmark, and highlights the importance of tailored training to improve LLM performance in capturing the nuances of diverse Arabic dialects and cultural contexts. We will release the dialectal translation models and benchmarks curated in this study.
**Homepage:**
https://huggingface.co/datasets/QCRI/AraDiCE
### Citation
```
@article{mousi2024aradicebenchmarksdialectalcultural,
title={{AraDiCE}: Benchmarks for Dialectal and Cultural Capabilities in LLMs},
author={Basel Mousi and Nadir Durrani and Fatema Ahmad and Md. Arid Hasan and Maram Hasanain and Tameem Kabbani and Fahim Dalvi and Shammur Absar Chowdhury and Firoj Alam},
year={2024},
journal={arXiv preprint arXiv:2409.11404},
url={https://arxiv.org/abs/2409.11404},
}
```
### Groups, Tags, and Tasks
#### Groups
* `AraDiCE`: group aggregating results over all of the AraDiCE tasks listed below.
#### Tasks
* `AraDiCE_ArabicMMLU_{lev,egy}`: ArabicMMLU subjects machine-translated into Levantine and Egyptian Arabic with human post-editing.
* `AraDiCE_boolq_{egy,eng,lev,msa}`: BoolQ in Egyptian Arabic, English, Levantine Arabic, and MSA.
* `AraDiCE_openbookqa_{egy,eng,lev,msa}`: OpenBookQA in the same four variants.
* `AraDiCE_piqa_{egy,eng,lev,msa}`: PIQA in the same four variants.
* `AraDiCE_truthfulqa_mc1_{egy,eng,lev,msa}`: TruthfulQA (MC1) in the same four variants.
* `AraDiCE_winogrande_{egy,eng,lev,msa}`: Winogrande in the same four variants.
* `AraDiCE_{egypt,jordan,lebanon,palestine,qatar,syria}_cultural`: country-specific cultural-awareness questions.
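As a quick smoke test, something like the following should run one of these tasks through the harness's Python API; the checkpoint here is a placeholder, not part of the benchmark:

```python
# Hedged sketch: evaluate a small placeholder checkpoint on one AraDiCE task.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder; substitute an Arabic-capable model
    tasks=["AraDiCE_boolq_eng"],   # or tasks=["AraDiCE"] for the whole group
    num_fewshot=0,
)
print(results["results"]["AraDiCE_boolq_eng"])
```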
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: AraDiCE
task:
- AraDiCE_ArabicMMLU_lev
- AraDiCE_ArabicMMLU_egy
- AraDiCE_boolq_egy
- AraDiCE_boolq_eng
- AraDiCE_boolq_lev
- AraDiCE_boolq_msa
- AraDiCE_egypt_cultural
- AraDiCE_jordan_cultural
- AraDiCE_lebanon_cultural
- AraDiCE_palestine_cultural
- AraDiCE_qatar_cultural
- AraDiCE_syria_cultural
- AraDiCE_openbookqa_egy
- AraDiCE_openbookqa_eng
- AraDiCE_openbookqa_lev
- AraDiCE_openbookqa_msa
- AraDiCE_piqa_egy
- AraDiCE_piqa_eng
- AraDiCE_piqa_lev
- AraDiCE_piqa_msa
- AraDiCE_truthfulqa_mc1_egy
- AraDiCE_truthfulqa_mc1_eng
- AraDiCE_truthfulqa_mc1_lev
- AraDiCE_truthfulqa_mc1_msa
- AraDiCE_winogrande_egy
- AraDiCE_winogrande_eng
- AraDiCE_winogrande_lev
- AraDiCE_winogrande_msa
task: AraDiCE_boolq_egy
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-egy
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:"
doc_to_target: target
doc_to_choice: ["لا", "نعم"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
# Map raw BoolQ answers (bool or string) to the Arabic yes/no labels used in doc_to_choice.
egy_answer_mapping = {"true": "نعم", "false": "لا", True: "نعم", False: "لا"}
def process_docs(dataset):
def remove_question_mark(text):
text = text.strip()
if text.endswith("?") or text.endswith("؟"):
text = text[:-1]
text = text.strip()
return text
def _helper(doc):
doc["question"] = remove_question_mark(doc["question"])
doc["target"] = egy_answer_mapping[doc["answer"]]
return doc
return dataset.map(_helper)
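A similar toy sketch for this BoolQ variant, again with invented field values and field names taken from `_helper`:

```python
# Invented record; only the fields _helper touches are populated.
from datasets import Dataset

from utils import process_docs  # this utils module

toy = Dataset.from_list([
    {"passage": "الماء سائل في درجة حرارة الغرفة.", "question": "هل الماء سائل؟", "answer": True}
])
doc = process_docs(toy)[0]
print(doc["question"])  # trailing question mark removed: "هل الماء سائل"
print(doc["target"])    # "نعم", which indexes into doc_to_choice ["لا", "نعم"]
```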
task: AraDiCE_boolq_eng
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-eng
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: target
doc_to_choice: ["no", "yes"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
en_answer_mapping = {"true": "yes", "false": "no", True: "yes", False: "no"}
def process_docs(dataset):
def remove_question_mark(text):
text = text.strip()
if text.endswith("?") or text.endswith("؟"):
text = text[:-1]
text = text.strip()
return text
def _helper(doc):
doc["question"] = remove_question_mark(doc["question"])
doc["target"] = en_answer_mapping[doc["answer"]]
return doc
return dataset.map(_helper)
task: AraDiCE_boolq_lev
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-lev
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:"
doc_to_target: target
doc_to_choice: ["لا", "نعم"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
lev_answer_mapping = {"true": "نعم", "false": "لا", True: "نعم", False: "لا"}
def process_docs(dataset):
def remove_question_mark(text):
text = text.strip()
if text.endswith("?") or text.endswith("؟"):
text = text[:-1]
text = text.strip()
return text
def _helper(doc):
doc["question"] = remove_question_mark(doc["question"])
doc["target"] = lev_answer_mapping[doc["answer"]]
return doc
return dataset.map(_helper)