Unverified Commit ac0bc1df authored by Baber Abbasi's avatar Baber Abbasi Committed by GitHub
Browse files

leaderboard - add subtask scores (#2867)

* add subtask scores

* pacify pre-commit
parent 6cc41d34
......@@ -9,4 +9,4 @@ include:
metric_list:
- metric: !function utils.bert
aggregation: !function utils.arabert
higher_is_better: true
\ No newline at end of file
higher_is_better: true
......@@ -9,4 +9,4 @@ include:
metric_list:
- metric: !function utils.bert
aggregation: !function utils.darijabert
higher_is_better: true
\ No newline at end of file
higher_is_better: true
......@@ -9,4 +9,4 @@ include:
metric_list:
- metric: !function utils.bert
aggregation: !function utils.darijabert
higher_is_better: true
\ No newline at end of file
higher_is_better: true
test_split: madar
\ No newline at end of file
test_split: madar
......@@ -9,4 +9,4 @@ include:
metric_list:
- metric: !function utils.bert
aggregation: !function utils.arabert
higher_is_better: true
\ No newline at end of file
higher_is_better: true
......@@ -9,4 +9,4 @@ include:
metric_list:
- metric: !function utils.bert
aggregation: !function utils.darijabert
higher_is_better: true
\ No newline at end of file
higher_is_better: true
test_split: seed
\ No newline at end of file
test_split: seed
......@@ -9,4 +9,4 @@ include:
metric_list:
- metric: !function utils.bert
aggregation: !function utils.bertbase
higher_is_better: true
\ No newline at end of file
higher_is_better: true
import evaluate
import datasets
import evaluate
def strip(resps, docs):
    """
    Keep only the first response of every entry in `resps`, with surrounding whitespace removed.

    `docs` is accepted but not used. The result is a lazy `map` object, one item per entry.
    """

    def _first_stripped(candidates):
        return candidates[0].strip()

    return map(_first_stripped, resps)
def dr_fr(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"dr_fr"``."""

    def _is_dr_fr(row):
        return row["direction"] == "dr_fr"

    return dataset.filter(_is_dr_fr)
def dr_en(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"dr_en"``."""

    def _is_dr_en(row):
        return row["direction"] == "dr_en"

    return dataset.filter(_is_dr_en)
def dr_msa(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"dr_msa"``."""

    def _is_dr_msa(row):
        return row["direction"] == "dr_msa"

    return dataset.filter(_is_dr_msa)
def fr_dr(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"fr_dr"``."""

    def _is_fr_dr(row):
        return row["direction"] == "fr_dr"

    return dataset.filter(_is_fr_dr)
def en_dr(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"en_dr"``.

    Args:
        dataset: The dataset to filter; every row must expose a ``direction`` key.

    Returns:
        Whatever ``dataset.filter`` returns for the ``direction == "en_dr"`` predicate.
    """
    return dataset.filter(lambda x: x["direction"] == "en_dr")
def msa_dr(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"msa_dr"``.

    Args:
        dataset: The dataset to filter; every row must expose a ``direction`` key.

    Returns:
        Whatever ``dataset.filter`` returns for the ``direction == "msa_dr"`` predicate.
    """
    return dataset.filter(lambda x: x["direction"] == "msa_dr")
# Arabic-script instruction prefix for each translation direction, keyed by the
# same direction codes the dataset filters in this module compare against.
# `{0}` is a positional placeholder; presumably it is filled with the source
# text via `str.format` -- confirm against the caller.
prompt_templates = {
    "fr_dr": "ترجم من الفرنساوية للدارجة:\n{0}",
    "dr_fr": "ترجم من الدارجة للفرنساوية:\n{0}",
    "en_dr": "ترجم من الإنجليزية للدارجة:\n{0}",
    "dr_en": "ترجم من الدارجة للإنجليزية:\n{0}",
    "msa_dr": "ترجم من الفصحى للدارجة:\n{0}",
    "dr_msa": "ترجم من الدارجة للفصحى:\n{0}",
}
def doc_to_text(doc):
    """Return the ``content`` of the first entry in ``doc["messages"]``."""
    first_message = doc["messages"][0]
    return first_message["content"]
def doc_to_target(doc):
    """Return the ``content`` of the second entry in ``doc["messages"]``."""
    second_message = doc["messages"][1]
    target = second_message["content"]
    return target
def bert(items):
    """
    Pass ``items`` through unchanged.

    No scoring happens here; the same object that was passed in is returned.
    """
    return items
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: An iterable of numbers. It is materialised once so that
            generators are accepted as well as lists.

    Raises:
        ZeroDivisionError: If ``lst`` is empty.
    """
    values = list(lst)
    return sum(values) / len(values)


def _mean_bertscore_f1(items, bert_model):
    """Score (prediction, reference) pairs with BERTScore and average the F1.

    Shared implementation behind the per-model aggregation functions below,
    which differ only in the model identifier they pass.

    Args:
        items: An iterable of ``(prediction, reference)`` pairs.
        bert_model: Model identifier forwarded to BERTScore as ``model_type``.

    Returns:
        The mean of the ``"f1"`` values returned by the BERTScore computation.
    """
    bert_score = evaluate.load("bertscore")
    # Transpose the pairs into parallel prediction / reference sequences.
    predictions, references = zip(*items)
    scores = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(scores["f1"])


def camembert(items):
    """Return the mean BERTScore F1 of ``items`` using ``almanach/camembert-base``."""
    return _mean_bertscore_f1(items, "almanach/camembert-base")


def darijabert(items):
    """Return the mean BERTScore F1 of ``items`` using ``SI2M-Lab/DarijaBERT``."""
    return _mean_bertscore_f1(items, "SI2M-Lab/DarijaBERT")


def arabert(items):
    """Return the mean BERTScore F1 of ``items`` using ``aubmindlab/bert-base-arabert``."""
    return _mean_bertscore_f1(items, "aubmindlab/bert-base-arabert")


def bertbase(items):
    """Return the mean BERTScore F1 of ``items`` using ``google-bert/bert-base-uncased``."""
    return _mean_bertscore_f1(items, "google-bert/bert-base-uncased")


def mbert(items):
    """Return the mean BERTScore F1 of ``items`` using ``google-bert/bert-base-multilingual-cased``."""
    return _mean_bertscore_f1(items, "google-bert/bert-base-multilingual-cased")
......@@ -16,13 +16,13 @@ Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaBench](https://hug
```
@article{shang2024atlaschatadaptinglargelanguage,
title={Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arabic Dialect},
title={Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arabic Dialect},
author={Guokan Shang and Hadi Abdine and Yousef Khoubrane and Amr Mohamed and Yassine Abbahaddou and Sofiane Ennadir and Imane Momayiz and Xuguang Ren and Eric Moulines and Preslav Nakov and Michalis Vazirgiannis and Eric Xing},
year={2024},
eprint={2409.17912},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.17912},
url={https://arxiv.org/abs/2409.17912},
}
```
......
import evaluate
import datasets
import evaluate
def strip(resps, docs):
......@@ -8,43 +8,66 @@ def strip(resps, docs):
"""
return map(lambda r: r[0].strip(), resps)
def dr_ar(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"dr_ar"``."""

    def _is_dr_ar(row):
        return row["direction"] == "dr_ar"

    return dataset.filter(_is_dr_ar)
def ar_dr(dataset: datasets.Dataset):
    """Filter ``dataset`` down to the rows whose ``direction`` field equals ``"ar_dr"``."""

    def _is_ar_dr(row):
        return row["direction"] == "ar_dr"

    return dataset.filter(_is_ar_dr)
def doc_to_text(doc):
    """Return the ``content`` of the first entry in ``doc["messages"]``."""
    first_message = doc["messages"][0]
    return first_message["content"]
def doc_to_target(doc):
    """Return the ``content`` of the second entry in ``doc["messages"]``."""
    second_message = doc["messages"][1]
    target = second_message["content"]
    return target
def bert(items):
    """
    Pass ``items`` through unchanged.

    No scoring happens here; the same object that was passed in is returned.
    """
    return items
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: An iterable of numbers. It is materialised once so that
            generators are accepted as well as lists.

    Raises:
        ZeroDivisionError: If ``lst`` is empty.
    """
    values = list(lst)
    return sum(values) / len(values)


def _mean_bertscore_f1(items, bert_model):
    """Score (prediction, reference) pairs with BERTScore and average the F1.

    Shared implementation behind the per-model aggregation functions below,
    which differ only in the model identifier they pass.

    Args:
        items: An iterable of ``(prediction, reference)`` pairs.
        bert_model: Model identifier forwarded to BERTScore as ``model_type``.

    Returns:
        The mean of the ``"f1"`` values returned by the BERTScore computation.
    """
    bert_score = evaluate.load("bertscore")
    # Transpose the pairs into parallel prediction / reference sequences.
    predictions, references = zip(*items)
    scores = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(scores["f1"])


def arabizibert(items):
    """Return the mean BERTScore F1 of ``items`` using ``SI2M-Lab/DarijaBERT-arabizi``."""
    return _mean_bertscore_f1(items, "SI2M-Lab/DarijaBERT-arabizi")


def darijabert(items):
    """Return the mean BERTScore F1 of ``items`` using ``SI2M-Lab/DarijaBERT``."""
    return _mean_bertscore_f1(items, "SI2M-Lab/DarijaBERT")


def mbert(items):
    """Return the mean BERTScore F1 of ``items`` using ``google-bert/bert-base-multilingual-cased``."""
    return _mean_bertscore_f1(items, "google-bert/bert-base-multilingual-cased")
......@@ -16,13 +16,13 @@ Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaHellaSwag](https:/
```
@article{shang2024atlaschatadaptinglargelanguage,
title={Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arabic Dialect},
title={Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arabic Dialect},
author={Guokan Shang and Hadi Abdine and Yousef Khoubrane and Amr Mohamed and Yassine Abbahaddou and Sofiane Ennadir and Imane Momayiz and Xuguang Ren and Eric Moulines and Preslav Nakov and Michalis Vazirgiannis and Eric Xing},
year={2024},
eprint={2409.17912},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.17912},
url={https://arxiv.org/abs/2409.17912},
}
```
......
......@@ -6,7 +6,7 @@ Title: Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arab
Abstract: [https://arxiv.org/abs/2409.17912](https://arxiv.org/abs/2409.17912)
DarijaMMLU is an evaluation benchmark designed to assess the performance of large language models (LLMs) in Moroccan Darija, a variety of Arabic. It consists of 22,027 multiple-choice questions covering 44 subjects, constructed by translating selected subsets of two major benchmarks into Darija from English and Modern Standard Arabic (MSA): Massive Multitask Language Understanding (MMLU) and ArabicMMLU.
Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaMMLU](https://huggingface.co/datasets/MBZUAI-Paris/DarijaMMLU)
......@@ -16,17 +16,17 @@ Homepage: [https://huggingface.co/datasets/MBZUAI-Paris/DarijaMMLU](https://hugg
```
@article{shang2024atlaschatadaptinglargelanguage,
title={Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arabic Dialect},
title={Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan Arabic Dialect},
author={Guokan Shang and Hadi Abdine and Yousef Khoubrane and Amr Mohamed and Yassine Abbahaddou and Sofiane Ennadir and Imane Momayiz and Xuguang Ren and Eric Moulines and Preslav Nakov and Michalis Vazirgiannis and Eric Xing},
year={2024},
eprint={2409.17912},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2409.17912},
url={https://arxiv.org/abs/2409.17912},
}
```
### Groups and Tasks
### Groups and Tasks
#### Groups
......
......@@ -61,7 +61,7 @@ ARABIC_MMLU_SUBJECTS = {
"economics": "social_sciences",
"arabic_language_(general)": "language",
"arabic_language_(grammar)": "language",
"civics": "social_sciences"
"civics": "social_sciences",
}
DATASETS = {
......@@ -93,15 +93,16 @@ if __name__ == "__main__":
yaml_dict = {
"include": base_yaml_name,
"tag": [f"darijammlu_{category}_tasks", "darijammlu_"+dataset+"_tasks"],
"tag": [
f"darijammlu_{category}_tasks",
"darijammlu_" + dataset + "_tasks",
],
"task": f"darijammlu_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
}
file_save_path = (
args.save_prefix_path + f"_{subject}.yaml"
)
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
......@@ -125,4 +126,4 @@ if __name__ == "__main__":
yaml_file,
indent=4,
default_flow_style=False,
)
\ No newline at end of file
)
......@@ -5,7 +5,6 @@ alpha = ["A.", "B.", "C.", "D.", "E."]
def doc_to_text(doc):
subject = doc["subject_darija"]
question = (
doc["question"]
......@@ -23,4 +22,4 @@ def doc_to_text(doc):
def doc_to_choice(doc):
    """Return one single-character answer label per entry in ``doc["choices"]``.

    Each label is the first character of the corresponding entry of the
    module-level ``alpha`` list.

    Raises:
        IndexError: If ``doc["choices"]`` has more entries than ``alpha``.
    """
    return [alpha[i][0] for i in range(len(doc["choices"]))]
......@@ -24,3 +24,7 @@ task:
- leaderboard_bbh_tracking_shuffled_objects_seven_objects
- leaderboard_bbh_tracking_shuffled_objects_three_objects
- leaderboard_bbh_web_of_lies
aggregate_metric_list:
- metric: acc_norm
aggregation: mean
weight_by_size: true
......@@ -3,3 +3,7 @@ task:
- leaderboard_gpqa_diamond
- leaderboard_gpqa_extended
- leaderboard_gpqa_main
aggregate_metric_list:
- metric: acc_norm
aggregation: mean
weight_by_size: true
......@@ -6,3 +6,27 @@ task:
- leaderboard_math_hard
- leaderboard_ifeval
- leaderboard_musr
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
- metric: exact_match
aggregation: mean
weight_by_size: true
- metric: inst_level_loose_acc
aggregation: mean
weight_by_size: true
- metric: inst_level_strict_acc
aggregation: mean
weight_by_size: true
- metric: prompt_level_loose_acc
aggregation: mean
weight_by_size: true
- metric: prompt_level_strict_acc
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
......@@ -7,3 +7,7 @@ task:
- leaderboard_math_num_theory_hard
- leaderboard_math_prealgebra_hard
- leaderboard_math_precalculus_hard
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
......@@ -3,3 +3,7 @@ task:
- leaderboard_musr_murder_mysteries
- leaderboard_musr_object_placements
- leaderboard_musr_team_allocation
aggregate_metric_list:
- metric: acc_norm
aggregation: mean
weight_by_size: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment