gaoqiong/lm-evaluation-harness · Commits

Commit ac0bc1df (unverified)
Authored Apr 02, 2025 by Baber Abbasi, committed via GitHub on Apr 02, 2025
Parent: 6cc41d34

leaderboard - add subtask scores (#2867)

* add subtask scores
* pacify pre-commit
Showing 20 changed files (page 1 of 2; 40 changes in total) with 167 additions and 64 deletions (+167 −64):
lm_eval/tasks/darija_bench/darija_translation/flores_translation_dr_msa.yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/flores_translation_fr_dr.yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/flores_translation_msa_dr.yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/madar_common_yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/madar_translation_dr_msa.yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/madar_translation_msa_dr.yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/seed_common_yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/seed_translation_dr_en.yaml (+1 −1)
lm_eval/tasks/darija_bench/darija_translation/utils.py (+68 −28)
lm_eval/tasks/darija_bench/darija_transliteration/README.md (+2 −2)
lm_eval/tasks/darija_bench/darija_transliteration/utils.py (+35 −12)
lm_eval/tasks/darijahellaswag/README.md (+2 −2)
lm_eval/tasks/darijammlu/README.md (+4 −4)
lm_eval/tasks/darijammlu/_generate_configs.py (+7 −6)
lm_eval/tasks/darijammlu/utils.py (+1 −2)
lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml (+4 −0)
lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml (+4 −0)
lm_eval/tasks/leaderboard/leaderboard.yaml (+24 −0)
lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml (+4 −0)
lm_eval/tasks/leaderboard/musr/_musr.yaml (+4 −0)
lm_eval/tasks/darija_bench/darija_translation/utils.py (file after the change):
import datasets
import evaluate


def strip(resps, docs):
    ...

@@ -12,21 +12,27 @@ def strip(resps, docs):

def dr_fr(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "dr_fr")


def dr_en(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "dr_en")


def dr_msa(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "dr_msa")


def fr_dr(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "fr_dr")


def en_dr(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "en_dr")


def msa_dr(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "msa_dr")


prompt_templates = {
    "fr_dr": "ترجم من الفرنساوية للدارجة:\n{0}",
    "dr_fr": "ترجم من الدارجة للفرنساوية:\n{0}",

@@ -34,52 +40,86 @@ prompt_templates = {

    "dr_en": "ترجم من الدارجة للإنجليزية:\n{0}",
    "msa_dr": "ترجم من الفصحى للدارجة:\n{0}",
    "dr_msa": "ترجم من الدارجة للفصحى:\n{0}",
}


def doc_to_text(doc):
    doc_text = doc["messages"][0]["content"]
    return doc_text


def doc_to_target(doc):
    return doc["messages"][1]["content"]


def bert(items):
    return items


def Average(lst):
    return sum(lst) / len(lst)


def camembert(items):
    bert_model = "almanach/camembert-base"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])


def darijabert(items):
    bert_model = "SI2M-Lab/DarijaBERT"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])


def arabert(items):
    bert_model = "aubmindlab/bert-base-arabert"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])


def bertbase(items):
    bert_model = "google-bert/bert-base-uncased"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])


def mbert(items):
    bert_model = "google-bert/bert-base-multilingual-cased"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])
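The five scorers above differ only in the HuggingFace checkpoint they hand to BERTScore. As a side note, the repetition could be factored into one helper; a minimal sketch, not part of this commit (the name `_bertscore_f1` is made up):

import evaluate


def _bertscore_f1(items, model_type, num_layers=12):
    # Corpus-level mean BERTScore F1 over (prediction, reference) pairs.
    # Hypothetical refactor of the camembert/darijabert/arabert/bertbase/mbert
    # pattern shown above; not what the commit ships.
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    scores = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=model_type,
        num_layers=num_layers,
    )
    return sum(scores["f1"]) / len(scores["f1"])


def camembert(items):
    return _bertscore_f1(items, "almanach/camembert-base")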
lm_eval/tasks/darija_bench/darija_transliteration/README.md

lm_eval/tasks/darija_bench/darija_transliteration/utils.py (file after the change):
import datasets
import evaluate


def strip(resps, docs):
    ...

@@ -12,39 +12,62 @@ def strip(resps, docs):

def dr_ar(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "dr_ar")


def ar_dr(dataset: datasets.Dataset):
    return dataset.filter(lambda x: x["direction"] == "ar_dr")


def doc_to_text(doc):
    doc_text = doc["messages"][0]["content"]
    return doc_text


def doc_to_target(doc):
    return doc["messages"][1]["content"]


def bert(items):
    return items


def Average(lst):
    return sum(lst) / len(lst)


def arabizibert(items):
    bert_model = "SI2M-Lab/DarijaBERT-arabizi"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])


def darijabert(items):
    bert_model = "SI2M-Lab/DarijaBERT"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])


def mbert(items):
    bert_model = "google-bert/bert-base-multilingual-cased"
    bert_score = evaluate.load("bertscore")
    predictions, references = zip(*items)
    bert = bert_score.compute(
        predictions=predictions,
        references=references,
        model_type=bert_model,
        num_layers=12,
    )
    return Average(bert["f1"])
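In the harness these functions act as aggregations over the (prediction, reference) pairs collected per instance, with `bert` serving as the per-instance passthrough. A standalone sanity check, assuming `evaluate` and `bert-score` are installed and the checkpoint can be downloaded; the example pairs are made up:

# Hypothetical input: (prediction, reference) pairs.
pairs = [
    ("salam, labas?", "salam labas"),
    ("chokran bzaf", "chokran bezzaf"),
]
print(darijabert(pairs))  # mean BERTScore F1 under SI2M-Lab/DarijaBERT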
lm_eval/tasks/darijahellaswag/README.md

lm_eval/tasks/darijammlu/README.md

lm_eval/tasks/darijammlu/_generate_configs.py (file after the change):
@@ -61,7 +61,7 @@ ARABIC_MMLU_SUBJECTS = {
    "economics": "social_sciences",
    "arabic_language_(general)": "language",
    "arabic_language_(grammar)": "language",
    "civics": "social_sciences",
}

DATASETS = {

@@ -93,15 +93,16 @@ if __name__ == "__main__":
        yaml_dict = {
            "include": base_yaml_name,
            "tag": [
                f"darijammlu_{category}_tasks",
                "darijammlu_" + dataset + "_tasks",
            ],
            "task": f"darijammlu_{subject}",
            "task_alias": subject.replace("_", " "),
            "dataset_name": subject,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                ...
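To make the loop above concrete, here is roughly what writing one per-subject file looks like; the include target, tag values, and dump arguments below are placeholders and sketch choices, not read from the repo:

import yaml

# Placeholder values standing in for base_yaml_name, category, and dataset.
yaml_dict = {
    "include": "_default_darijammlu_template_yaml",  # placeholder
    "tag": ["darijammlu_social_sciences_tasks", "darijammlu_mmlu_tasks"],  # placeholders
    "task": "darijammlu_economics",
    "task_alias": "economics",
    "dataset_name": "economics",
}
with open("darijammlu_economics.yaml", "w", encoding="utf-8") as f:
    yaml.dump(yaml_dict, f, sort_keys=False, allow_unicode=True)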
lm_eval/tasks/darijammlu/utils.py (file after the change):
@@ -5,7 +5,6 @@ alpha = ["A.", "B.", "C.", "D.", "E."]

def doc_to_text(doc):
    subject = doc["subject_darija"]
    question = (
        doc["question"]
        ...

@@ -23,4 +22,4 @@ def doc_to_text(doc):

def doc_to_choice(doc):
    return [alpha[i][0] for i in range(len(doc["choices"]))]
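For reference, `doc_to_choice` yields one single-letter label per available choice by taking the first character of each entry in `alpha`. A worked example with a made-up doc:

# Hypothetical document; only the number of choices matters here.
doc = {"choices": ["rabat", "casablanca", "fes"]}
# alpha = ["A.", "B.", "C.", "D.", "E."], and alpha[i][0] keeps just the letter:
# doc_to_choice(doc) -> ["A", "B", "C"]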
lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml (file after the change):
@@ -24,3 +24,7 @@ task:
  - leaderboard_bbh_tracking_shuffled_objects_seven_objects
  - leaderboard_bbh_tracking_shuffled_objects_three_objects
  - leaderboard_bbh_web_of_lies
aggregate_metric_list:
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
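This `aggregate_metric_list` block (repeated below for gpqa, math, musr, and the top-level leaderboard group) is what makes each group report a single score on top of its per-subtask scores: a mean over subtasks, weighted by subtask size since `weight_by_size` is true. A simplified sketch of that aggregation, not the harness's exact code:

def weighted_mean(scores, sizes):
    # Size-weighted mean over subtasks, as implied by weight_by_size: true.
    # scores: per-subtask metric values; sizes: number of docs per subtask.
    return sum(s * n for s, n in zip(scores, sizes)) / sum(sizes)

# e.g. two subtasks scoring 0.30 on 198 docs and 0.25 on 546 docs:
# weighted_mean([0.30, 0.25], [198, 546]) ≈ 0.263, vs. 0.275 unweighted.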
lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml (file after the change):
@@ -3,3 +3,7 @@ task:
  - leaderboard_gpqa_diamond
  - leaderboard_gpqa_extended
  - leaderboard_gpqa_main
aggregate_metric_list:
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
lm_eval/tasks/leaderboard/leaderboard.yaml (file after the change):
@@ -6,3 +6,27 @@ task:
  - leaderboard_math_hard
  - leaderboard_ifeval
  - leaderboard_musr
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
  - metric: exact_match
    aggregation: mean
    weight_by_size: true
  - metric: inst_level_loose_acc
    aggregation: mean
    weight_by_size: true
  - metric: inst_level_strict_acc
    aggregation: mean
    weight_by_size: true
  - metric: prompt_level_loose_acc
    aggregation: mean
    weight_by_size: true
  - metric: prompt_level_strict_acc
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
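With these `aggregate_metric_list` entries in place, running the `leaderboard` group (for example `lm_eval --tasks leaderboard --model hf --model_args pretrained=<model>`, where `<model>` is a placeholder) should report each subtask's score alongside the size-weighted group aggregates.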
lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml (file after the change):
@@ -7,3 +7,7 @@ task:
  - leaderboard_math_num_theory_hard
  - leaderboard_math_prealgebra_hard
  - leaderboard_math_precalculus_hard
aggregate_metric_list:
  - metric: exact_match
    aggregation: mean
    weight_by_size: true
lm_eval/tasks/leaderboard/musr/_musr.yaml (file after the change):
@@ -3,3 +3,7 @@ task:
  - leaderboard_musr_murder_mysteries
  - leaderboard_musr_object_placements
  - leaderboard_musr_team_allocation
aggregate_metric_list:
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true