Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
3e8135ce
Commit
3e8135ce
authored
Sep 16, 2025
by
Baber
Browse files
Merge branch 'main' into comma
parents
8e560c96
0c134ee9
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
95 additions
and
89 deletions
+95
-89
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+1
-1
lm_eval/tasks/minerva_math/utils.py
lm_eval/tasks/minerva_math/utils.py
+8
-5
lm_eval/tasks/mlqa/README.md
lm_eval/tasks/mlqa/README.md
+50
-50
lm_eval/tasks/mmlu/README.md
lm_eval/tasks/mmlu/README.md
+3
-0
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
+1
-1
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
+4
-4
lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml
lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml
lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml
lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml
lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml
lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml
lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml
lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml
...asks/mmlu/continuation/mmlu_college_computer_science.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml
...val/tasks/mmlu/continuation/mmlu_college_mathematics.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml
lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml
lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml
lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml
lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml
+2
-2
lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml
lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml
+2
-2
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
View file @
3e8135ce
...
...
@@ -24,7 +24,7 @@ metric_list:
higher_is_better
:
true
num_fewshot
:
4
metadata
:
version
:
2
.0
version
:
3
.0
fewshot_config
:
sampler
:
first_n
samples
:
!function
utils.list_fewshot_samples
lm_eval/tasks/minerva_math/utils.py
View file @
3e8135ce
...
...
@@ -71,7 +71,7 @@ def list_fewshot_samples() -> list[dict]:
]
def
process_results
(
doc
:
dict
,
results
:
L
ist
[
str
])
->
D
ict
[
str
,
int
]:
def
process_results
(
doc
:
dict
,
results
:
l
ist
[
str
])
->
d
ict
[
str
,
int
]:
candidates
=
results
[
0
]
unnormalized_answer
=
get_unnormalized_answer
(
candidates
)
...
...
@@ -83,14 +83,17 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
retval
=
0
# math_verify
res
=
verify
(
parse
(
doc
[
"answer"
]),
parse
(
candidates
))
mathval
=
1
if
res
else
0
_mvres
=
verify
(
gold
=
parse
(
doc
[
"solution"
]),
target
=
parse
(
candidates
),
)
mathval
=
1
if
_mvres
else
0
res
ults
=
{
res
=
{
"exact_match"
:
retval
,
"math_verify"
:
mathval
,
}
return
res
ults
return
res
def
last_boxed_only_string
(
string
:
str
)
->
Optional
[
str
]:
...
...
lm_eval/tasks/mlqa/README.md
View file @
3e8135ce
...
...
@@ -36,56 +36,56 @@ Homepage: `https://github.com/facebookresearch/MLQA`
#### Tasks
Tasks of the form
`mlqa_context-lang_question-lang
.yaml
`
*
`mlqa_ar_ar
.yaml
`
*
`mlqa_ar_de
.yaml
`
*
`mlqa_ar_vi
.yaml
`
*
`mlqa_ar_zh
.yaml
`
*
`mlqa_ar_en
.yaml
`
*
`mlqa_ar_es
.yaml
`
*
`mlqa_ar_hi
.yaml
`
*
`mlqa_de_ar
.yaml
`
*
`mlqa_de_de
.yaml
`
*
`mlqa_de_vi
.yaml
`
*
`mlqa_de_zh
.yaml
`
*
`mlqa_de_en
.yaml
`
*
`mlqa_de_es
.yaml
`
*
`mlqa_de_hi
.yaml
`
*
`mlqa_vi_ar
.yaml
`
*
`mlqa_vi_de
.yaml
`
*
`mlqa_vi_vi
.yaml
`
*
`mlqa_vi_zh
.yaml
`
*
`mlqa_vi_en
.yaml
`
*
`mlqa_vi_es
.yaml
`
*
`mlqa_vi_hi
.yaml
`
*
`mlqa_zh_ar
.yaml
`
*
`mlqa_zh_de
.yaml
`
*
`mlqa_zh_vi
.yaml
`
*
`mlqa_zh_zh
.yaml
`
*
`mlqa_zh_en
.yaml
`
*
`mlqa_zh_es
.yaml
`
*
`mlqa_zh_hi
.yaml
`
*
`mlqa_en_ar
.yaml
`
*
`mlqa_en_de
.yaml
`
*
`mlqa_en_vi
.yaml
`
*
`mlqa_en_zh
.yaml
`
*
`mlqa_en_en
.yaml
`
*
`mlqa_en_es
.yaml
`
*
`mlqa_en_hi
.yaml
`
*
`mlqa_es_ar
.yaml
`
*
`mlqa_es_de
.yaml
`
*
`mlqa_es_vi
.yaml
`
*
`mlqa_es_zh
.yaml
`
*
`mlqa_es_en
.yaml
`
*
`mlqa_es_es
.yaml
`
*
`mlqa_es_hi
.yaml
`
*
`mlqa_hi_ar
.yaml
`
*
`mlqa_hi_de
.yaml
`
*
`mlqa_hi_vi
.yaml
`
*
`mlqa_hi_zh
.yaml
`
*
`mlqa_hi_en
.yaml
`
*
`mlqa_hi_es
.yaml
`
*
`mlqa_hi_hi
.yaml
`
Tasks of the form
`mlqa_context-lang_question-lang`
*
`mlqa_ar_ar`
*
`mlqa_ar_de`
*
`mlqa_ar_vi`
*
`mlqa_ar_zh`
*
`mlqa_ar_en`
*
`mlqa_ar_es`
*
`mlqa_ar_hi`
*
`mlqa_de_ar`
*
`mlqa_de_de`
*
`mlqa_de_vi`
*
`mlqa_de_zh`
*
`mlqa_de_en`
*
`mlqa_de_es`
*
`mlqa_de_hi`
*
`mlqa_vi_ar`
*
`mlqa_vi_de`
*
`mlqa_vi_vi`
*
`mlqa_vi_zh`
*
`mlqa_vi_en`
*
`mlqa_vi_es`
*
`mlqa_vi_hi`
*
`mlqa_zh_ar`
*
`mlqa_zh_de`
*
`mlqa_zh_vi`
*
`mlqa_zh_zh`
*
`mlqa_zh_en`
*
`mlqa_zh_es`
*
`mlqa_zh_hi`
*
`mlqa_en_ar`
*
`mlqa_en_de`
*
`mlqa_en_vi`
*
`mlqa_en_zh`
*
`mlqa_en_en`
*
`mlqa_en_es`
*
`mlqa_en_hi`
*
`mlqa_es_ar`
*
`mlqa_es_de`
*
`mlqa_es_vi`
*
`mlqa_es_zh`
*
`mlqa_es_en`
*
`mlqa_es_es`
*
`mlqa_es_hi`
*
`mlqa_hi_ar`
*
`mlqa_hi_de`
*
`mlqa_hi_vi`
*
`mlqa_hi_zh`
*
`mlqa_hi_en`
*
`mlqa_hi_es`
*
`mlqa_hi_hi`
### Checklist
...
...
lm_eval/tasks/mmlu/README.md
View file @
3e8135ce
...
...
@@ -71,3 +71,6 @@ switch to original implementation
ver 2: PR #2116
add missing newline in description.
PR #3137
Fix
`mmlu_continuation`
subgroup names to fit other variants, and switch dataset from
`hails/mmlu_no_train`
to
`cais/mmlu`
in all subtasks.
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
View file @
3e8135ce
dataset_path:
hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
dataset_path:
cais/mmlu
output_type: multiple_choice
test_split: test
fewshot_split: dev
...
...
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
View file @
3e8135ce
...
...
@@ -3,25 +3,25 @@ group_alias: mmlu (continuation)
task
:
-
group
:
stem
task
:
-
mmlu_continuation
_stem
-
mmlu_
stem_
continuation
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_continuation
_other
-
mmlu_
other_
continuation
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_
continuation_
social_sciences
-
mmlu_social_sciences
_continuation
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_continuation
_humanities
-
mmlu_
humanities_
continuation
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
...
...
lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml
View file @
3e8135ce
"
dataset_name"
:
"
abstract_algebra"
"
description"
:
"
The
following
are
questions
(with
answers)
about
abstract
\
\
algebra.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
abstract_algebra"
"
task"
:
"
mmlu_abstract_algebra
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml
View file @
3e8135ce
"
dataset_name"
:
"
anatomy"
"
description"
:
"
The
following
are
questions
(with
answers)
about
anatomy.
\n\
\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation
_anatomy
"
"
task"
:
"
mmlu_
anatomy_
continuation"
lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml
View file @
3e8135ce
"
dataset_name"
:
"
astronomy"
"
description"
:
"
The
following
are
questions
(with
answers)
about
astronomy.
\n\
\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation
_astronomy
"
"
task"
:
"
mmlu_
astronomy_
continuation"
lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml
View file @
3e8135ce
"
dataset_name"
:
"
business_ethics"
"
description"
:
"
The
following
are
questions
(with
answers)
about
business
\
\
ethics.
\n\n
"
"
tag"
:
"
mmlu_continuation
_other
"
"
tag"
:
"
mmlu_
other_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
business_ethics"
"
task"
:
"
mmlu_business_ethics
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml
View file @
3e8135ce
"
dataset_name"
:
"
clinical_knowledge"
"
description"
:
"
The
following
are
questions
(with
answers)
about
clinical
\
\
knowledge.
\n\n
"
"
tag"
:
"
mmlu_continuation
_other
"
"
tag"
:
"
mmlu_
other_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
clinical_knowledge"
"
task"
:
"
mmlu_clinical_knowledge
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml
View file @
3e8135ce
"
dataset_name"
:
"
college_biology"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
biology.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_co
ntinuation_co
llege_biology"
"
task"
:
"
mmlu_college_biology
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml
View file @
3e8135ce
"
dataset_name"
:
"
college_chemistry"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
chemistry.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
college_chemistry"
"
task"
:
"
mmlu_college_chemistry
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml
View file @
3e8135ce
"
dataset_name"
:
"
college_computer_science"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
computer
science.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
college_computer_science"
"
task"
:
"
mmlu_college_computer_science
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml
View file @
3e8135ce
"
dataset_name"
:
"
college_mathematics"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
mathematics.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
college_mathematics"
"
task"
:
"
mmlu_college_mathematics
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml
View file @
3e8135ce
"
dataset_name"
:
"
college_medicine"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
medicine.
\n\n
"
"
tag"
:
"
mmlu_continuation
_other
"
"
tag"
:
"
mmlu_
other_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
college_medicine"
"
task"
:
"
mmlu_college_medicine
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml
View file @
3e8135ce
"
dataset_name"
:
"
college_physics"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
physics.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_co
ntinuation_co
llege_physics"
"
task"
:
"
mmlu_college_physics
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml
View file @
3e8135ce
"
dataset_name"
:
"
computer_security"
"
description"
:
"
The
following
are
questions
(with
answers)
about
computer
\
\
security.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
computer_security"
"
task"
:
"
mmlu_computer_security
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml
View file @
3e8135ce
"
dataset_name"
:
"
conceptual_physics"
"
description"
:
"
The
following
are
questions
(with
answers)
about
conceptual
\
\
physics.
\n\n
"
"
tag"
:
"
mmlu_continuation
_stem
"
"
tag"
:
"
mmlu_
stem_
continuation"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_
conceptual_physics"
"
task"
:
"
mmlu_conceptual_physics
_continuation
"
lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml
View file @
3e8135ce
"
dataset_name"
:
"
econometrics"
"
description"
:
"
The
following
are
questions
(with
answers)
about
econometrics.
\n\
\n
"
"
tag"
:
"
mmlu_
continuation_
social_sciences"
"
tag"
:
"
mmlu_social_sciences
_continuation
"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_
continuation_econometrics
"
"
task"
:
"
mmlu_
econometrics_continuation
"
Prev
1
…
12
13
14
15
16
17
18
19
20
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment