Unverified Commit 932e8f9e authored by Firoj Alam (Scientist, QCRI), committed by GitHub

AraDICE task config file (#2507)



* added aradice

* Added ArabicMMLU Lev Configs

* added ArabicMMLU egy configs

* Added boolq configs

* Added cultural bench configs

* added openbookqa configs

* Added PiQA configs

* added winogrande configs

* Added truthfulQA configs

* Added aradice group config

* Remove deleted files from repository

* modified arabicmmlu configs

* modified metadata versions

* fixed formatting using ruff

* added aradice tasks information

* pre-commit

* Updated openbookqa utils

* fixed formatting on obqa

---------
Co-authored-by: Basel Mousi <bmousi@hbku.edu.qa>
Co-authored-by: Baber <baber@hey.com>
parent b86aa213
"dataset_name": "prof_humanities_law"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_humanities_lev"
"task": "AraDiCE_ArabicMMLU_prof_humanities_law_lev"
"task_alias": "prof humanities law"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_other_management"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_other_lev"
"task": "AraDiCE_ArabicMMLU_univ_other_management_lev"
"task_alias": "univ other management"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_accounting"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_lev"
"task": "AraDiCE_ArabicMMLU_univ_social-science_accounting_lev"
"task_alias": "univ social-science accounting"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_economics"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_lev"
"task": "AraDiCE_ArabicMMLU_univ_social-science_economics_lev"
"task_alias": "univ social-science economics"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_social-science_political-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_social-science_lev"
"task": "AraDiCE_ArabicMMLU_univ_social-science_political-science_lev"
"task_alias": "univ social-science political-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
"dataset_name": "univ_stem_computer-science"
"description": ""
"fewshot_split": !!null "null"
"include": "_default_template_yaml"
"tag": "AraDiCE_ArabicMMLU_stem_lev"
"task": "AraDiCE_ArabicMMLU_univ_stem_computer-science_lev"
"task_alias": "univ stem computer-science"
"test_split": "test"
"training_split": !!null "null"
"validation_split": !!null "null"
dataset_path: QCRI/AraDICE-ArabicMMLU-lev
fewshot_config:
sampler: default
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{prompt}}"
doc_to_choice: choices
doc_to_target: target
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 0.0
from sklearn.metrics import f1_score


# Each aggregation below receives `items`, a list of (gold, pred) label pairs
# collected across documents, and returns the corresponding F1 variant.
def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="macro")
    return fscore


def micro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="micro")
    return fscore


def weighted_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds, average="weighted")
    return fscore
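These aggregations are wired into the task YAML via `aggregation: !function metrics.micro_f1_score`. A toy check of their behaviour, assuming the harness passes a list of (gold, pred) label pairs:

```python
# Assumed item format: one (gold, pred) pair per evaluated document.
from metrics import macro_f1_score, micro_f1_score

items = [(0, 0), (1, 1), (1, 0), (0, 0)]
print(micro_f1_score(items))  # 0.75 -- micro-averaged F1 equals accuracy here
print(macro_f1_score(items))  # unweighted mean of the per-class F1 scores
```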
# Arabic phrasings for the metadata fields interpolated into the prompt.
level_ar = {
"Primary": "للمرحلة الابتدائية",
"Middle": "للمرحلة المتوسطة",
"High": "للمرحلة الثانوية",
"Univ": "للمرحلة الجامعية ",
"Prof": "للمحترفين",
}
country_ar = {
"UAE": "بالإمارات",
"Egypt": "بمصر",
"Lebanon": "بلبنان",
"Jordan": "بالأردن",
"Kuwait": "بالكويت",
"KSA": "بالسعودية",
"Palestine": "بفلسطين",
"Morocco": "بالمغرب",
}
subject_ar = {
"Islamic Studies": "عن الدراسات إسلامية",
"Driving Test": "عن فحص السواقة",
"Natural Science": "عن العلوم الطبيعية",
"History": "تاريخ",
"General Knowledge": "معرفة عامة",
"Law": "عن القانون",
"Physics": "فيزياء",
"Social Science": "علوم اجتماعية",
"Management": "عن الإدارة",
"Arabic Language": "عن اللغة العربية",
"Political Science": " عن العلوم السياسية",
"Philosophy": "فلسفة",
"Accounting": "محاسبة",
"Computer Science": "عن علوم الحاسوب",
"Geography": "جغرافيا",
"Math": "رياضيات",
"Biology": "بيولوجي",
"Economics": "اقتصاد",
"Arabic Language (General)": "لغة العربية (عام)",
"Arabic Language (Grammar)": "لغة العربية (نحو)",
"Civics": "تربية مدنية",
}
alpa_ar = ["أ-", "ب-", "ج-", "د-", "و-"]
alpa_en = ["A-", "B-", "C-", "D-", "E-"]
all_choices = ["أ", "ب", "ج", "د", "و"]
all_choices_en = ["A", "B", "C", "D", "E"]
def process_docs(dataset):
def _helper(doc):
# modifies the contents of a single
# document in our dataset.
        # Levantine Arabic prompt template; roughly: "This is a [METADATA]
        # question. Pick the correct answer!\n\nQuestion: [INPUT]\n[OPTIONS]\n\nAnswer:"
        PROMPT = (
            "هيدا سؤال [MAIN_META_DATA]. نقي الجواب الصح!\n\nسؤال: [INPUT]\n[OPTION]"
        )
        PROMPT = f"{PROMPT}\n\nالجواب:"
alpa = alpa_ar
subject = subject_ar[doc["Subject"]]
level = " " + level_ar[doc["Level"]] if doc["Level"] else ""
country = " " + country_ar[doc["Country"]] if doc["Country"] else ""
main_meta_data = f"{subject}{level}{country}"
question = (
f"{doc['context']}\n\n{doc['question']}"
if doc["context"]
else doc["question"]
)
options = []
for i, opt in enumerate(["A", "B", "C", "D", "E"]):
if opt not in doc["options"] or doc["options"][opt] is None:
break
options.append(f"{alpa[i]} {doc['options'][opt]}")
doc["prompt"] = (
PROMPT.replace("[MAIN_META_DATA]", main_meta_data)
.replace("[INPUT]", question)
.replace("[OPTION]", "\n".join(options))
)
doc["choices"] = all_choices[: len(options)]
doc["target"] = ["A", "B", "C", "D", "E"].index(doc["Answer Key"])
return doc
return dataset.map(_helper)
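For reference, a minimal sketch of what `process_docs` produces on a single toy record. The field values below are invented for illustration; the field names mirror the ones `_helper` reads from the HF dataset:

```python
# Toy record; values are made up, field names follow what _helper accesses.
from datasets import Dataset

from utils import process_docs  # this utils module

toy = Dataset.from_list([
    {
        "Subject": "Math",
        "Level": "High",
        "Country": "Lebanon",
        "context": "",
        "question": "كم يساوي ٢ + ٢؟",
        "options": {"A": "٣", "B": "٤", "C": "٥", "D": None, "E": None},
        "Answer Key": "B",
    }
])

processed = process_docs(toy)
print(processed[0]["prompt"])   # Levantine prompt with subject/level/country metadata
print(processed[0]["choices"])  # ['أ', 'ب', 'ج'] -- one Arabic letter per populated option
print(processed[0]["target"])   # 1
```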
# AraDiCE
### Paper
**Title:** AraDiCE: Benchmarks for Dialectal and Cultural Capabilities in LLMs
**Abstract:** Arabic, with its rich diversity of dialects, remains significantly underrepresented in Large Language Models, particularly in dialectal variations. We address this gap by introducing seven synthetic datasets in dialects alongside Modern Standard Arabic (MSA), created using Machine Translation (MT) combined with human post-editing. We present AraDiCE, a benchmark for Arabic Dialect and Cultural Evaluation. We evaluate LLMs on dialect comprehension and generation, focusing specifically on low-resource Arabic dialects. Additionally, we introduce the first-ever fine-grained benchmark designed to evaluate cultural awareness across the Gulf, Egypt, and Levant regions, providing a novel dimension to LLM evaluation. Our findings demonstrate that while Arabic-specific models like Jais and AceGPT outperform multilingual models on dialectal tasks, significant challenges persist in dialect identification, generation, and translation. This work contributes ~45K post-edited samples, a cultural benchmark, and highlights the importance of tailored training to improve LLM performance in capturing the nuances of diverse Arabic dialects and cultural contexts. We will release the dialectal translation models and benchmarks curated in this study.
**Homepage:**
https://huggingface.co/datasets/QCRI/AraDiCE
### Citation
```
@article{mousi2024aradicebenchmarksdialectalcultural,
title={{AraDiCE}: Benchmarks for Dialectal and Cultural Capabilities in LLMs},
author={Basel Mousi and Nadir Durrani and Fatema Ahmad and Md. Arid Hasan and Maram Hasanain and Tameem Kabbani and Fahim Dalvi and Shammur Absar Chowdhury and Firoj Alam},
year={2024},
journal={arXiv preprint arXiv:2409.11404},
url={https://arxiv.org/abs/2409.11404},
}
```
### Groups, Tags, and Tasks
#### Groups
* `AraDiCE`: group aggregating results over all of the AraDiCE tasks listed below.
#### Tasks
* `AraDiCE_ArabicMMLU_{lev,egy}`: ArabicMMLU subjects machine-translated into Levantine and Egyptian Arabic with human post-editing.
* `AraDiCE_boolq_{egy,eng,lev,msa}`: BoolQ in Egyptian Arabic, English, Levantine Arabic, and MSA.
* `AraDiCE_openbookqa_{egy,eng,lev,msa}`: OpenBookQA in the same four variants.
* `AraDiCE_piqa_{egy,eng,lev,msa}`: PIQA in the same four variants.
* `AraDiCE_truthfulqa_mc1_{egy,eng,lev,msa}`: TruthfulQA (MC1) in the same four variants.
* `AraDiCE_winogrande_{egy,eng,lev,msa}`: Winogrande in the same four variants.
* `AraDiCE_{egypt,jordan,lebanon,palestine,qatar,syria}_cultural`: country-specific cultural-awareness questions.
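As a quick smoke test, something like the following should run one of these tasks through the harness's Python API; the checkpoint here is a placeholder, not part of the benchmark:

```python
# Hedged sketch: evaluate a small placeholder checkpoint on one AraDiCE task.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder; substitute an Arabic-capable model
    tasks=["AraDiCE_boolq_eng"],   # or tasks=["AraDiCE"] for the whole group
    num_fewshot=0,
)
print(results["results"]["AraDiCE_boolq_eng"])
```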
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: AraDiCE
task:
- AraDiCE_ArabicMMLU_lev
- AraDiCE_ArabicMMLU_egy
- AraDiCE_boolq_egy
- AraDiCE_boolq_eng
- AraDiCE_boolq_lev
- AraDiCE_boolq_msa
- AraDiCE_egypt_cultural
- AraDiCE_jordan_cultural
- AraDiCE_lebanon_cultural
- AraDiCE_palestine_cultural
- AraDiCE_qatar_cultural
- AraDiCE_syria_cultural
- AraDiCE_openbookqa_egy
- AraDiCE_openbookqa_eng
- AraDiCE_openbookqa_lev
- AraDiCE_openbookqa_msa
- AraDiCE_piqa_egy
- AraDiCE_piqa_eng
- AraDiCE_piqa_lev
- AraDiCE_piqa_msa
- AraDiCE_truthfulqa_mc1_egy
- AraDiCE_truthfulqa_mc1_eng
- AraDiCE_truthfulqa_mc1_lev
- AraDiCE_truthfulqa_mc1_msa
- AraDiCE_winogrande_egy
- AraDiCE_winogrande_eng
- AraDiCE_winogrande_lev
- AraDiCE_winogrande_msa
task: AraDiCE_boolq_egy
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-egy
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:"
doc_to_target: target
doc_to_choice: ["لا", "نعم"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
# Map raw BoolQ answers (bool or string) to the Arabic yes/no labels used in doc_to_choice.
egy_answer_mapping = {"true": "نعم", "false": "لا", True: "نعم", False: "لا"}
def process_docs(dataset):
def remove_question_mark(text):
text = text.strip()
if text.endswith("?") or text.endswith("؟"):
text = text[:-1]
text = text.strip()
return text
def _helper(doc):
doc["question"] = remove_question_mark(doc["question"])
doc["target"] = egy_answer_mapping[doc["answer"]]
return doc
return dataset.map(_helper)
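A similar toy sketch for this BoolQ variant, again with invented field values and field names taken from `_helper`:

```python
# Invented record; only the fields _helper touches are populated.
from datasets import Dataset

from utils import process_docs  # this utils module

toy = Dataset.from_list([
    {"passage": "الماء سائل في درجة حرارة الغرفة.", "question": "هل الماء سائل؟", "answer": True}
])
doc = process_docs(toy)[0]
print(doc["question"])  # trailing question mark removed: "هل الماء سائل"
print(doc["target"])    # "نعم", which indexes into doc_to_choice ["لا", "نعم"]
```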
task: AraDiCE_boolq_eng
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-eng
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: target
doc_to_choice: ["no", "yes"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
en_answer_mapping = {"true": "yes", "false": "no", True: "yes", False: "no"}
def process_docs(dataset):
def remove_question_mark(text):
text = text.strip()
if text.endswith("?") or text.endswith("؟"):
text = text[:-1]
text = text.strip()
return text
def _helper(doc):
doc["question"] = remove_question_mark(doc["question"])
doc["target"] = en_answer_mapping[doc["answer"]]
return doc
return dataset.map(_helper)
task: AraDiCE_boolq_lev
dataset_path: QCRI/AraDiCE-BoolQ
dataset_name: BoolQ-lev
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{passage}}\nسؤال: {{question}}؟\nجواب:"
doc_to_target: target
doc_to_choice: ["لا", "نعم"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
lev_answer_mapping = {"true": "نعم", "false": "لا", True: "نعم", False: "لا"}
def process_docs(dataset):
def remove_question_mark(text):
text = text.strip()
if text.endswith("?") or text.endswith("؟"):
text = text[:-1]
text = text.strip()
return text
def _helper(doc):
doc["question"] = remove_question_mark(doc["question"])
doc["target"] = lev_answer_mapping[doc["answer"]]
return doc
return dataset.map(_helper)