Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
b58e5556
Commit
b58e5556
authored
Jul 27, 2025
by
Baber
Browse files
Merge branch 'main' into tasklist
# Conflicts: # pyproject.toml
parents
6e1866f5
4f8195f1
Changes
340
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
265 additions
and
0 deletions
+265
-0
lm_eval/tasks/egymmlu/_default_egymmlu_template_yaml
lm_eval/tasks/egymmlu/_default_egymmlu_template_yaml
+15
-0
lm_eval/tasks/egymmlu/_egymmlu.yaml
lm_eval/tasks/egymmlu/_egymmlu.yaml
+10
-0
lm_eval/tasks/egymmlu/_egymmlu_ar_mmlu.yaml
lm_eval/tasks/egymmlu/_egymmlu_ar_mmlu.yaml
+9
-0
lm_eval/tasks/egymmlu/_egymmlu_mmlu.yaml
lm_eval/tasks/egymmlu/_egymmlu_mmlu.yaml
+9
-0
lm_eval/tasks/egymmlu/_generate_configs.py
lm_eval/tasks/egymmlu/_generate_configs.py
+117
-0
lm_eval/tasks/egymmlu/egymmlu_accounting.yaml
lm_eval/tasks/egymmlu/egymmlu_accounting.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_arabic_language.yaml
lm_eval/tasks/egymmlu/egymmlu_arabic_language.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_arabic_language_(general).yaml
lm_eval/tasks/egymmlu/egymmlu_arabic_language_(general).yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_arabic_language_(grammar).yaml
lm_eval/tasks/egymmlu/egymmlu_arabic_language_(grammar).yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_biology.yaml
lm_eval/tasks/egymmlu/egymmlu_biology.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_civics.yaml
lm_eval/tasks/egymmlu/egymmlu_civics.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_computer_science.yaml
lm_eval/tasks/egymmlu/egymmlu_computer_science.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_driving_test.yaml
lm_eval/tasks/egymmlu/egymmlu_driving_test.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_economics.yaml
lm_eval/tasks/egymmlu/egymmlu_economics.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_general_knowledge.yaml
lm_eval/tasks/egymmlu/egymmlu_general_knowledge.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_geography.yaml
lm_eval/tasks/egymmlu/egymmlu_geography.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_global_facts.yaml
lm_eval/tasks/egymmlu/egymmlu_global_facts.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_high_school_european_history.yaml
...l/tasks/egymmlu/egymmlu_high_school_european_history.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_high_school_geography.yaml
lm_eval/tasks/egymmlu/egymmlu_high_school_geography.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_high_school_government_and_politics.yaml
.../egymmlu/egymmlu_high_school_government_and_politics.yaml
+7
-0
No files found.
lm_eval/tasks/egymmlu/_default_egymmlu_template_yaml
0 → 100644
View file @
b58e5556
dataset_path: UBC-NLP/EgyMMLU
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
lm_eval/tasks/egymmlu/_egymmlu.yaml
0 → 100644
View file @
b58e5556
group
:
egymmlu
group_alias
:
EgyMMLU
task
:
-
egymmlu_mmlu
-
egymmlu_ar_mmlu
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
0
lm_eval/tasks/egymmlu/_egymmlu_ar_mmlu.yaml
0 → 100644
View file @
b58e5556
group
:
egymmlu_ar_mmlu
group_alias
:
ArabicMMLU
task
:
-
egymmlu_ar_mmlu_tasks
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
0
lm_eval/tasks/egymmlu/_egymmlu_mmlu.yaml
0 → 100644
View file @
b58e5556
group
:
egymmlu_mmlu
group_alias
:
MMLU
task
:
-
egymmlu_mmlu_tasks
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
0
lm_eval/tasks/egymmlu/_generate_configs.py
0 → 100644
View file @
b58e5556
"""
Take in a base template YAML and write one task YAML per subject, each of which includes that base YAML.
"""
import
argparse
import
logging
import
os
import
yaml
from
tqdm
import
tqdm
# Module-level logger obtained under the name "lm-eval"; the script body uses
# it to report each YAML file it is about to write.
eval_logger
=
logging
.
getLogger
(
"lm-eval"
)
# Subject -> category table registered under the "mmlu" key of DATASETS.
# The script body iterates these items: each key becomes the `dataset_name`,
# the suffix of the task name (`egymmlu_<subject>`) and of the output file
# name; each value is used to build the `egymmlu_<category>_tasks` tag.
# NOTE(review): keys presumably must match the subset names of the dataset
# referenced by the base template YAML -- confirm against the dataset.
MMLU_SUBJECTS
=
{
"global_facts"
:
"other"
,
"high_school_european_history"
:
"humanities"
,
"high_school_geography"
:
"social_sciences"
,
"high_school_government_and_politics"
:
"social_sciences"
,
"high_school_psychology"
:
"social_sciences"
,
"high_school_statistics"
:
"stem"
,
"high_school_world_history"
:
"humanities"
,
"human_aging"
:
"other"
,
"international_law"
:
"humanities"
,
"jurisprudence"
:
"humanities"
,
"logical_fallacies"
:
"humanities"
,
"management"
:
"other"
,
"marketing"
:
"other"
,
"moral_disputes"
:
"humanities"
,
"moral_scenarios"
:
"humanities"
,
"nutrition"
:
"other"
,
"philosophy"
:
"humanities"
,
"professional_law"
:
"humanities"
,
"professional_psychology"
:
"social_sciences"
,
"public_relations"
:
"social_sciences"
,
"security_studies"
:
"social_sciences"
,
"sociology"
:
"social_sciences"
,
"world_religions"
:
"humanities"
,
}
# Subject -> category table registered under the "ar_mmlu" key of DATASETS.
# Consumed the same way as MMLU_SUBJECTS: the key is the `dataset_name` and
# the suffix of the task/file name, the value selects the
# `egymmlu_<category>_tasks` tag.
# Some keys contain parentheses (e.g. "arabic_language_(general)"); they are
# used verbatim, so the generated task names and file names contain them too.
ARABIC_MMLU_SUBJECTS
=
{
"islamic_studies"
:
"humanities"
,
"driving_test"
:
"other"
,
"natural_science"
:
"stem"
,
"history"
:
"humanities"
,
"general_knowledge"
:
"other"
,
"law"
:
"humanities"
,
"physics"
:
"stem"
,
"social_science"
:
"social_sciences"
,
"management_ar"
:
"other"
,
"arabic_language"
:
"language"
,
"political_science"
:
"social_sciences"
,
"philosophy_ar"
:
"humanities"
,
"accounting"
:
"social_sciences"
,
"computer_science"
:
"stem"
,
"geography"
:
"social_sciences"
,
"math"
:
"stem"
,
"biology"
:
"stem"
,
"economics"
:
"social_sciences"
,
"arabic_language_(general)"
:
"language"
,
"arabic_language_(grammar)"
:
"language"
,
"civics"
:
"social_sciences"
,
}
# Source-benchmark name -> subject table. The script body iterates this
# mapping and uses each key to build the `egymmlu_<dataset>_tasks` tag
# ("egymmlu_mmlu_tasks" / "egymmlu_ar_mmlu_tasks") written to every
# generated task YAML.
DATASETS
=
{
"mmlu"
:
MMLU_SUBJECTS
,
"ar_mmlu"
:
ARABIC_MMLU_SUBJECTS
,
}
def parse_args(argv=None):
    """Parse the command-line options of the EgyMMLU config generator.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so the
            existing zero-argument call keeps its behavior.

    Returns:
        An ``argparse.Namespace`` with two string attributes:
        ``base_yaml_path`` (default ``"_default_egymmlu_template_yaml"``)
        and ``save_prefix_path`` (default ``"egymmlu"``).
    """
    parser = argparse.ArgumentParser(
        description="Generate one EgyMMLU task YAML per subject.",
    )
    parser.add_argument(
        "--base_yaml_path",
        default="_default_egymmlu_template_yaml",
        help=(
            "Path to the base template YAML; its file name is written to "
            "the 'include' field of every generated config."
        ),
    )
    parser.add_argument(
        "--save_prefix_path",
        default="egymmlu",
        help=(
            "Prefix of the generated file paths; each config is saved as "
            "'<prefix>_<subject>.yaml'."
        ),
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: write one task YAML per subject listed in DATASETS.
    args = parse_args()

    # Only the file name of the template (not its directory) is stored in the
    # generated configs' "include" field.
    # NOTE(review): presumably the harness resolves "include" relative to the
    # generated file's directory -- confirm.
    base_yaml_name = os.path.basename(args.base_yaml_path)

    for dataset, subjects in DATASETS.items():
        for subject, category in tqdm(subjects.items()):
            yaml_dict = {
                "include": base_yaml_name,
                # Two tags per task: one by subject category and one by the
                # benchmark the subject comes from.
                "tag": [
                    f"egymmlu_{category}_tasks",
                    f"egymmlu_{dataset}_tasks",
                ],
                "task": f"egymmlu_{subject}",
                "task_alias": subject.replace("_", " "),
                "dataset_name": subject,
            }

            file_save_path = f"{args.save_prefix_path}_{subject}.yaml"
            eval_logger.info(
                "Saving yaml for subset %s to %s", subject, file_save_path
            )
            with open(file_save_path, "w", encoding="utf-8") as yaml_file:
                # default_style='"' makes PyYAML double-quote every scalar
                # (keys included) in the emitted file.
                yaml.dump(
                    yaml_dict,
                    yaml_file,
                    allow_unicode=True,
                    default_style='"',
                )

    # NOTE: this script only emits the per-subject task configs. No aggregate
    # benchmark YAML is written, so nothing is logged about saving one.
lm_eval/tasks/egymmlu/egymmlu_accounting.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
accounting"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_accounting"
"
task_alias"
:
"
accounting"
lm_eval/tasks/egymmlu/egymmlu_arabic_language.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
arabic_language"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_language_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_arabic_language"
"
task_alias"
:
"
arabic
language"
lm_eval/tasks/egymmlu/egymmlu_arabic_language_(general).yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
arabic_language_(general)"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_language_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_arabic_language_(general)"
"
task_alias"
:
"
arabic
language
(general)"
lm_eval/tasks/egymmlu/egymmlu_arabic_language_(grammar).yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
arabic_language_(grammar)"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_language_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_arabic_language_(grammar)"
"
task_alias"
:
"
arabic
language
(grammar)"
lm_eval/tasks/egymmlu/egymmlu_biology.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
biology"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_stem_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_biology"
"
task_alias"
:
"
biology"
lm_eval/tasks/egymmlu/egymmlu_civics.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
civics"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_civics"
"
task_alias"
:
"
civics"
lm_eval/tasks/egymmlu/egymmlu_computer_science.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
computer_science"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_stem_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_computer_science"
"
task_alias"
:
"
computer
science"
lm_eval/tasks/egymmlu/egymmlu_driving_test.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
driving_test"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_other_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_driving_test"
"
task_alias"
:
"
driving
test"
lm_eval/tasks/egymmlu/egymmlu_economics.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
economics"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_economics"
"
task_alias"
:
"
economics"
lm_eval/tasks/egymmlu/egymmlu_general_knowledge.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
general_knowledge"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_other_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_general_knowledge"
"
task_alias"
:
"
general
knowledge"
lm_eval/tasks/egymmlu/egymmlu_geography.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
geography"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_geography"
"
task_alias"
:
"
geography"
lm_eval/tasks/egymmlu/egymmlu_global_facts.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
global_facts"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_other_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_global_facts"
"
task_alias"
:
"
global
facts"
lm_eval/tasks/egymmlu/egymmlu_high_school_european_history.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
high_school_european_history"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_humanities_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_high_school_european_history"
"
task_alias"
:
"
high
school
european
history"
lm_eval/tasks/egymmlu/egymmlu_high_school_geography.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
high_school_geography"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_high_school_geography"
"
task_alias"
:
"
high
school
geography"
lm_eval/tasks/egymmlu/egymmlu_high_school_government_and_politics.yaml
0 → 100644
View file @
b58e5556
"
dataset_name"
:
"
high_school_government_and_politics"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_high_school_government_and_politics"
"
task_alias"
:
"
high
school
government
and
politics"
Prev
1
2
3
4
5
6
7
8
9
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment