Unverified commit ac0bc1df, authored by Baber Abbasi, committed by GitHub

leaderboard - add subtask scores (#2867)

* add subtask scores

* pacify pre-commit
parent 6cc41d34
-import evaluate
 import datasets
+import evaluate


 def strip(resps, docs):
@@ -12,21 +12,27 @@ def strip(resps, docs):
 def dr_fr(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "dr_fr")


 def dr_en(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "dr_en")


 def dr_msa(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "dr_msa")


 def fr_dr(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "fr_dr")


 def en_dr(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "en_dr")


 def msa_dr(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "msa_dr")


 prompt_templates = {
     "fr_dr": "ترجم من الفرنساوية للدارجة:\n{0}",
     "dr_fr": "ترجم من الدارجة للفرنساوية:\n{0}",
@@ -34,52 +40,86 @@ prompt_templates = {
     "dr_en": "ترجم من الدارجة للإنجليزية:\n{0}",
     "msa_dr": "ترجم من الفصحى للدارجة:\n{0}",
     "dr_msa": "ترجم من الدارجة للفصحى:\n{0}",
 }


 def doc_to_text(doc):
     doc_text = doc["messages"][0]["content"]
     return doc_text


 def doc_to_target(doc):
     return doc["messages"][1]["content"]


 def bert(items):
     return items


 def Average(lst):
     return sum(lst) / len(lst)


 def camembert(items):
-    bert_model = 'almanach/camembert-base'
+    bert_model = "almanach/camembert-base"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])


 def darijabert(items):
-    bert_model = 'SI2M-Lab/DarijaBERT'
+    bert_model = "SI2M-Lab/DarijaBERT"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])


 def arabert(items):
     bert_model = "aubmindlab/bert-base-arabert"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])


 def bertbase(items):
     bert_model = "google-bert/bert-base-uncased"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])


 def mbert(items):
-    bert_model = 'google-bert/bert-base-multilingual-cased'
+    bert_model = "google-bert/bert-base-multilingual-cased"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])
-import evaluate
 import datasets
+import evaluate


 def strip(resps, docs):
@@ -12,39 +12,62 @@ def strip(resps, docs):
 def dr_ar(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "dr_ar")


 def ar_dr(dataset: datasets.Dataset):
     return dataset.filter(lambda x: x["direction"] == "ar_dr")


 def doc_to_text(doc):
     doc_text = doc["messages"][0]["content"]
     return doc_text


 def doc_to_target(doc):
     return doc["messages"][1]["content"]


 def bert(items):
     return items


 def Average(lst):
     return sum(lst) / len(lst)


 def arabizibert(items):
     bert_model = "SI2M-Lab/DarijaBERT-arabizi"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])


 def darijabert(items):
-    bert_model = 'SI2M-Lab/DarijaBERT'
+    bert_model = "SI2M-Lab/DarijaBERT"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])


 def mbert(items):
-    bert_model = 'google-bert/bert-base-multilingual-cased'
+    bert_model = "google-bert/bert-base-multilingual-cased"
     bert_score = evaluate.load("bertscore")
     predictions, references = zip(*items)
-    bert = bert_score.compute(predictions=predictions, references=references, model_type=bert_model, num_layers=12)
-    return Average(bert['f1'])
+    bert = bert_score.compute(
+        predictions=predictions,
+        references=references,
+        model_type=bert_model,
+        num_layers=12,
+    )
+    return Average(bert["f1"])
@@ -61,7 +61,7 @@ ARABIC_MMLU_SUBJECTS = {
     "economics": "social_sciences",
     "arabic_language_(general)": "language",
     "arabic_language_(grammar)": "language",
-    "civics": "social_sciences"
+    "civics": "social_sciences",
 }

 DATASETS = {
@@ -93,15 +93,16 @@ if __name__ == "__main__":
         yaml_dict = {
             "include": base_yaml_name,
-            "tag": [f"darijammlu_{category}_tasks", "darijammlu_"+dataset+"_tasks"],
+            "tag": [
+                f"darijammlu_{category}_tasks",
+                "darijammlu_" + dataset + "_tasks",
+            ],
             "task": f"darijammlu_{subject}",
             "task_alias": subject.replace("_", " "),
             "dataset_name": subject,
         }

-        file_save_path = (
-            args.save_prefix_path + f"_{subject}.yaml"
-        )
+        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
         eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
         with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
...
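
For a quick sanity check of the generator above, a hedged sketch of how one yaml_dict entry serializes; the include target and dataset tag are hypothetical placeholders, not values taken from this PR:

import yaml

# Placeholder values standing in for base_yaml_name, category, dataset, subject.
yaml_dict = {
    "include": "_default_darijammlu_template_yaml",  # hypothetical base file name
    "tag": ["darijammlu_social_sciences_tasks", "darijammlu_knowledge_tasks"],  # hypothetical
    "task": "darijammlu_civics",
    "task_alias": "civics",
    "dataset_name": "civics",
}
print(yaml.dump(yaml_dict, allow_unicode=True, sort_keys=False))
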
@@ -5,7 +5,6 @@ alpha = ["A.", "B.", "C.", "D.", "E."]
 def doc_to_text(doc):
     subject = doc["subject_darija"]
     question = (
         doc["question"]
@@ -23,4 +22,4 @@ def doc_to_text(doc):
 def doc_to_choice(doc):
-    return [alpha[i][0] for i in range(len(doc['choices']))]
+    return [alpha[i][0] for i in range(len(doc["choices"]))]
@@ -24,3 +24,7 @@ task:
   - leaderboard_bbh_tracking_shuffled_objects_seven_objects
   - leaderboard_bbh_tracking_shuffled_objects_three_objects
   - leaderboard_bbh_web_of_lies
+aggregate_metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
@@ -3,3 +3,7 @@ task:
   - leaderboard_gpqa_diamond
   - leaderboard_gpqa_extended
   - leaderboard_gpqa_main
+aggregate_metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
@@ -6,3 +6,27 @@ task:
   - leaderboard_math_hard
   - leaderboard_ifeval
   - leaderboard_musr
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+  - metric: inst_level_loose_acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: inst_level_strict_acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: prompt_level_loose_acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: prompt_level_strict_acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
@@ -7,3 +7,7 @@ task:
  - leaderboard_math_num_theory_hard
  - leaderboard_math_prealgebra_hard
  - leaderboard_math_precalculus_hard
+aggregate_metric_list:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
@@ -3,3 +3,7 @@ task:
  - leaderboard_musr_murder_mysteries
  - leaderboard_musr_object_placements
  - leaderboard_musr_team_allocation
+aggregate_metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
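
Each aggregate_metric_list block added above requests a mean over subtask scores with weight_by_size: true. A minimal sketch of the arithmetic that implies, using illustrative numbers rather than real subtask results:

def weighted_mean(scores, sizes):
    # Weight each subtask score by its example count, then normalize by the total.
    return sum(score * size for score, size in zip(scores, sizes)) / sum(sizes)

# e.g. acc_norm from three hypothetical subtasks of different sizes
print(weighted_mean([0.52, 0.61, 0.47], [250, 198, 300]))  # ≈ 0.524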