Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
bd028848
Commit
bd028848
authored
Jul 18, 2025
by
Baber
Browse files
Merge branch 'main' into metrics
# Conflicts: # tests/test_tasks.py
parents
6e48110e
56def33d
Changes
108
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
129 additions
and
2 deletions
+129
-2
lm_eval/tasks/egymmlu/egymmlu_moral_scenarios.yaml
lm_eval/tasks/egymmlu/egymmlu_moral_scenarios.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_natural_science.yaml
lm_eval/tasks/egymmlu/egymmlu_natural_science.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_nutrition.yaml
lm_eval/tasks/egymmlu/egymmlu_nutrition.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_philosophy.yaml
lm_eval/tasks/egymmlu/egymmlu_philosophy.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_philosophy_ar.yaml
lm_eval/tasks/egymmlu/egymmlu_philosophy_ar.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_physics.yaml
lm_eval/tasks/egymmlu/egymmlu_physics.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_political_science.yaml
lm_eval/tasks/egymmlu/egymmlu_political_science.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_professional_law.yaml
lm_eval/tasks/egymmlu/egymmlu_professional_law.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_professional_psychology.yaml
lm_eval/tasks/egymmlu/egymmlu_professional_psychology.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_public_relations.yaml
lm_eval/tasks/egymmlu/egymmlu_public_relations.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_security_studies.yaml
lm_eval/tasks/egymmlu/egymmlu_security_studies.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_social_science.yaml
lm_eval/tasks/egymmlu/egymmlu_social_science.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_sociology.yaml
lm_eval/tasks/egymmlu/egymmlu_sociology.yaml
+7
-0
lm_eval/tasks/egymmlu/egymmlu_world_religions.yaml
lm_eval/tasks/egymmlu/egymmlu_world_religions.yaml
+7
-0
lm_eval/tasks/egymmlu/utils.py
lm_eval/tasks/egymmlu/utils.py
+25
-0
lm_eval/tasks/evalita_llm/_evalita-mp_ner_adg
lm_eval/tasks/evalita_llm/_evalita-mp_ner_adg
+0
-0
lm_eval/tasks/evalita_llm/_evalita-mp_ner_fic
lm_eval/tasks/evalita_llm/_evalita-mp_ner_fic
+0
-0
lm_eval/tasks/evalita_llm/_evalita-mp_ner_wn
lm_eval/tasks/evalita_llm/_evalita-mp_ner_wn
+0
-0
lm_eval/tasks/meddialog/utils.py
lm_eval/tasks/meddialog/utils.py
+3
-1
lm_eval/tasks/mediqa_qa2019/utils.py
lm_eval/tasks/mediqa_qa2019/utils.py
+3
-1
No files found.
lm_eval/tasks/egymmlu/egymmlu_moral_scenarios.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
moral_scenarios"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_humanities_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_moral_scenarios"
"
task_alias"
:
"
moral
scenarios"
lm_eval/tasks/egymmlu/egymmlu_natural_science.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
natural_science"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_stem_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_natural_science"
"
task_alias"
:
"
natural
science"
lm_eval/tasks/egymmlu/egymmlu_nutrition.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
nutrition"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_other_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_nutrition"
"
task_alias"
:
"
nutrition"
lm_eval/tasks/egymmlu/egymmlu_philosophy.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
philosophy"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_humanities_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_philosophy"
"
task_alias"
:
"
philosophy"
lm_eval/tasks/egymmlu/egymmlu_philosophy_ar.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
philosophy_ar"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_humanities_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_philosophy_ar"
"
task_alias"
:
"
philosophy
ar"
lm_eval/tasks/egymmlu/egymmlu_physics.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
physics"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_stem_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_physics"
"
task_alias"
:
"
physics"
lm_eval/tasks/egymmlu/egymmlu_political_science.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
political_science"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_political_science"
"
task_alias"
:
"
political
science"
lm_eval/tasks/egymmlu/egymmlu_professional_law.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
professional_law"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_humanities_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_professional_law"
"
task_alias"
:
"
professional
law"
lm_eval/tasks/egymmlu/egymmlu_professional_psychology.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
professional_psychology"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_professional_psychology"
"
task_alias"
:
"
professional
psychology"
lm_eval/tasks/egymmlu/egymmlu_public_relations.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
public_relations"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_public_relations"
"
task_alias"
:
"
public
relations"
lm_eval/tasks/egymmlu/egymmlu_security_studies.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
security_studies"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_security_studies"
"
task_alias"
:
"
security
studies"
lm_eval/tasks/egymmlu/egymmlu_social_science.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
social_science"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_ar_mmlu_tasks"
"
task"
:
"
egymmlu_social_science"
"
task_alias"
:
"
social
science"
lm_eval/tasks/egymmlu/egymmlu_sociology.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
sociology"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_social_sciences_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_sociology"
"
task_alias"
:
"
sociology"
lm_eval/tasks/egymmlu/egymmlu_world_religions.yaml
0 → 100644
View file @
bd028848
"
dataset_name"
:
"
world_religions"
"
include"
:
"
_default_egymmlu_template_yaml"
"
tag"
:
-
"
egymmlu_humanities_tasks"
-
"
egymmlu_mmlu_tasks"
"
task"
:
"
egymmlu_world_religions"
"
task_alias"
:
"
world
religions"
lm_eval/tasks/egymmlu/utils.py
0 → 100644
View file @
bd028848
# Arabic-script prompt template with three positional ``{}`` slots. In this
# module they are filled, in order, with the subject, the question text, and
# the newline-joined answer options.
PROMPT = (
    "ده سؤال متعدد الاختيار (مع إجابته) على {}"
    "\n\n{}"
    "\n{}"
    "\nالجواب:"
)

# Option labels in display order: a capital letter followed by a period.
alpha = [f"{letter}." for letter in "ABCDE"]
def doc_to_text(doc):
    """Build the multiple-choice prompt text for one document.

    Reads the ``egy_subject``, ``question``, ``context`` and ``choices``
    entries of ``doc``. When ``context`` is not the empty string it is placed
    in front of the question, separated by a blank line. Every choice is
    prefixed with the label at the same position in ``alpha``; the labelled
    choices are joined with newlines and substituted into ``PROMPT`` together
    with the subject and the question.

    Returns the formatted prompt string.
    """
    subject = doc["egy_subject"]  # subject_egyptian

    context = doc["context"]
    if context == "":
        question = doc["question"]
    else:
        # Context first, then a blank line, then the question itself.
        question = f"{context}\n\n{doc['question']}"

    labelled_choices = [
        f"{alpha[position]} {choice}"
        for position, choice in enumerate(doc["choices"])
    ]

    return PROMPT.format(subject, question, "\n".join(labelled_choices))
def doc_to_choice(doc):
    """Return the bare answer letters for the choices of ``doc``.

    One entry is produced per element of ``doc["choices"]``: the first
    character of the label at the same position in ``alpha`` (the label
    without its trailing period).
    """
    num_choices = len(doc["choices"])
    return [alpha[position][0] for position in range(num_choices)]
lm_eval/tasks/evalita_llm/_evalita-mp_ner_adg
.yaml
→
lm_eval/tasks/evalita_llm/_evalita-mp_ner_adg
View file @
bd028848
File moved
lm_eval/tasks/evalita_llm/_evalita-mp_ner_fic
.yaml
→
lm_eval/tasks/evalita_llm/_evalita-mp_ner_fic
View file @
bd028848
File moved
lm_eval/tasks/evalita_llm/_evalita-mp_ner_wn
.yaml
→
lm_eval/tasks/evalita_llm/_evalita-mp_ner_wn
View file @
bd028848
File moved
lm_eval/tasks/meddialog/utils.py
View file @
bd028848
...
...
@@ -11,7 +11,9 @@ try:
except
(
ModuleNotFoundError
,
ImportError
):
raise
ModuleNotFoundError
(
"Please install evaluation metrics via pip install evaluate and pip install bert-score"
,
"Please install evaluation metrics via pip install evaluate bert-score "
"rouge_score>=0.1.2 nltk absl-py "
"git+https://github.com/google-research/bleurt.git"
)
except
Exception
as
e
:
raise
RuntimeError
(
...
...
lm_eval/tasks/mediqa_qa2019/utils.py
View file @
bd028848
...
...
@@ -11,7 +11,9 @@ try:
except
(
ModuleNotFoundError
,
ImportError
):
raise
ModuleNotFoundError
(
"Please install evaluation metrics via pip install evaluate and pip install bert-score"
,
"Please install evaluation metrics via pip install evaluate bert-score "
"rouge_score>=0.1.2 nltk absl-py "
"git+https://github.com/google-research/bleurt.git"
)
except
Exception
as
e
:
raise
RuntimeError
(
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment