Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
60c9c170
Commit
60c9c170
authored
May 29, 2024
by
haileyschoelkopf
Browse files
Merge branch 'main' into inverse-scaling-tasks
parents
4b2d565b
b4cd85d4
Changes
605
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
319 additions
and
9 deletions
+319
-9
lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml
...asks/hendrycks_math/hendrycks_math_counting_and_prob.yaml
+3
-0
lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml
lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml
+3
-0
lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml
...s/hendrycks_math/hendrycks_math_intermediate_algebra.yaml
+3
-0
lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml
lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml
+3
-0
lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml
lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml
+3
-0
lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml
lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml
+3
-0
lm_eval/tasks/hendrycks_math/utils.py
lm_eval/tasks/hendrycks_math/utils.py
+231
-0
lm_eval/tasks/ifeval/ifeval.yaml
lm_eval/tasks/ifeval/ifeval.yaml
+0
-1
lm_eval/tasks/minerva_math/README.md
lm_eval/tasks/minerva_math/README.md
+2
-7
lm_eval/tasks/mmlu/_generate_configs.py
lm_eval/tasks/mmlu/_generate_configs.py
+3
-1
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
+11
-0
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml
lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml
lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml
lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml
lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml
lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml
lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml
lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml
+6
-0
lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml
...asks/mmlu/continuation/mmlu_college_computer_science.yaml
+6
-0
No files found.
lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml
0 → 100644
View file @
60c9c170
include
:
hendrycks_math_algebra.yaml
dataset_name
:
counting_and_probability
task
:
hendrycks_math_counting_and_prob
lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml
0 → 100644
View file @
60c9c170
include
:
hendrycks_math_algebra.yaml
dataset_name
:
geometry
task
:
hendrycks_math_geometry
lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml
0 → 100644
View file @
60c9c170
include
:
hendrycks_math_algebra.yaml
dataset_name
:
intermediate_algebra
task
:
hendrycks_math_intermediate_algebra
lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml
0 → 100644
View file @
60c9c170
include
:
hendrycks_math_algebra.yaml
dataset_name
:
number_theory
task
:
hendrycks_math_num_theory
lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml
0 → 100644
View file @
60c9c170
include
:
hendrycks_math_algebra.yaml
dataset_name
:
prealgebra
task
:
hendrycks_math_prealgebra
lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml
0 → 100644
View file @
60c9c170
include
:
hendrycks_math_algebra.yaml
dataset_name
:
precalculus
task
:
hendrycks_math_precalc
lm_eval/tasks/hendrycks_math/utils.py
0 → 100644
View file @
60c9c170
from
typing
import
Dict
,
List
import
datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Attach an extracted ``answer`` field to every document of *dataset*.

    Each mapped document keeps its ``problem`` and ``solution`` values and
    gains an ``answer`` value obtained by running ``last_boxed_only_string``
    followed by ``remove_boxed`` on the solution text.
    """

    def _process_doc(doc: dict) -> dict:
        # Read the fields in the same order the output dict lists them.
        problem = doc["problem"]
        solution = doc["solution"]
        boxed = last_boxed_only_string(solution)
        return {
            "problem": problem,
            "solution": solution,
            "answer": remove_boxed(boxed),
        }

    return dataset.map(_process_doc)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    """Score the first generation in *results* against the boxed reference.

    If the generation contains two or more ``$`` characters, only the text
    strictly between the first and the last ``$`` is treated as the answer;
    otherwise the whole generation is used. The answer is compared with the
    boxed content of ``doc["solution"]`` via ``is_equiv``.

    Returns:
        ``{"exact_match": 1}`` when the answers are equivalent, else
        ``{"exact_match": 0}``.
    """
    generation = results[0]

    if generation.count("$") <= 1:
        answer = generation
    else:
        first_dollar = generation.find("$")
        last_dollar = generation.rfind("$")
        answer = generation[first_dollar + 1 : last_dollar]

    reference = remove_boxed(last_boxed_only_string(doc["solution"]))
    score = 1 if is_equiv(answer, reference) else 0

    return {
        "exact_match": score,
    }
# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
def is_equiv(str1, str2, verbose=False):
    """Return whether two answer strings are equal after normalization.

    Two ``None`` values count as equivalent (a warning is printed); exactly
    one ``None`` is never equivalent. Otherwise both strings are normalized
    with ``strip_string`` and compared. If normalization raises, the raw
    inputs are compared instead.

    Args:
        str1: First answer string, or ``None``.
        str2: Second answer string, or ``None``.
        verbose: When true, print the two normalized strings.
    """
    first_missing = str1 is None
    second_missing = str2 is None
    if first_missing or second_missing:
        if first_missing and second_missing:
            print("WARNING: Both None")
            return True
        return False

    try:
        normalized_first = strip_string(str1)
        normalized_second = strip_string(str2)
        if verbose:
            print(normalized_first, normalized_second)
        return normalized_first == normalized_second
    except Exception:
        # Best-effort: normalization failed, so compare the untouched inputs.
        return str1 == str2
def remove_boxed(s):
    r"""Strip the LaTeX box wrapper from *s* and return the inner content.

    Accepted forms are ``\boxed <content>`` (space-delimited) and the
    brace-delimited ``\boxed{<content>}`` / ``\fbox{<content>}``. The
    ``\fbox`` form is accepted because ``last_boxed_only_string`` falls back
    to it when no ``\boxed`` is present.

    Args:
        s: A string beginning with one of the wrappers above, or ``None``
            (which ``last_boxed_only_string`` returns when nothing is boxed).

    Returns:
        The wrapped content, or ``None`` when *s* is ``None``.

    Raises:
        AssertionError: If *s* does not have one of the accepted shapes.
            Raised explicitly so the check still runs under ``python -O``.
    """
    if s is None:
        return None

    spaced = "\\boxed "
    if spaced in s:
        if not s.startswith(spaced):
            raise AssertionError("expected string to start with '\\boxed '")
        return s[len(spaced) :]

    for left in ("\\boxed{", "\\fbox{"):
        if s.startswith(left):
            if not s.endswith("}"):
                raise AssertionError("expected boxed string to end with '}'")
            return s[len(left) : -1]

    raise AssertionError("expected string to start with '\\boxed{'")
def last_boxed_only_string(string):
    r"""Extract the final boxed expression from *string*.

    If the space-delimited form ``\boxed `` occurs anywhere, the text after
    its last occurrence (up to the next ``$``) is returned with the
    ``\boxed `` prefix. Otherwise the last ``\boxed`` (or, failing that,
    the last ``\fbox``) is located and the text from there through the brace
    that brings the brace count back to zero is returned.

    Returns:
        The boxed substring including its wrapper, or ``None`` when there is
        no box or its braces never balance.
    """
    if "\\boxed " in string:
        tail = string.split("\\boxed ")[-1]
        return "\\boxed " + tail.split("$")[0]

    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
        if start < 0:
            return None

    depth = 0
    for position in range(start, len(string)):
        char = string[position]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return string[start : position + 1]

    # Ran off the end without the braces balancing.
    return None
def fix_fracs(string):
    r"""Add the braces LaTeX lets authors omit after ``\frac``.

    ``\frac12`` becomes ``\frac{1}{2}``, ``\frac1b`` becomes ``\frac{1}{b}``
    and ``\frac1{72}`` becomes ``\frac{1}{72}``. Occurrences whose first
    argument is already braced are left untouched.

    Returns:
        The rewritten string, or the original *string* unchanged when some
        ``\frac`` is followed by fewer than two characters (including a
        trailing ``\frac`` with nothing after it).
    """
    head, *pieces = string.split("\\frac")
    rebuilt = [head]
    for piece in pieces:
        rebuilt.append("\\frac")
        if piece.startswith("{"):
            # Numerator already braced; keep the remainder verbatim.
            rebuilt.append(piece)
            continue
        if len(piece) < 2:
            # Not enough characters for a numerator and denominator
            # (also covers an empty piece, which used to raise IndexError).
            return string
        numerator, second, rest = piece[0], piece[1], piece[2:]
        if second == "{":
            # Denominator is already braced: only wrap the numerator.
            rebuilt.append("{" + numerator + "}" + second + rest)
        else:
            rebuilt.append("{" + numerator + "}{" + second + "}" + rest)
    return "".join(rebuilt)
def fix_a_slash_b(string):
    r"""Rewrite a plain integer ratio ``a/b`` as ``\frac{a}{b}``.

    The rewrite happens only when *string* consists of exactly two integer
    literals separated by a single ``/`` and round-trips unchanged through
    ``int`` (so ``" 1/2"`` or ``"01/2"`` are left alone).

    Returns:
        The ``\frac{a}{b}`` form, or *string* unchanged when it is not a
        simple integer ratio.
    """
    parts = string.split("/")
    if len(parts) != 2:
        return string
    try:
        a = int(parts[0])
        b = int(parts[1])
    except ValueError:
        # Non-integer operands such as "x/y" or "1.5/2": nothing to fix.
        return string
    if string != "{}/{}".format(a, b):
        # Extra whitespace, leading zeros, etc.; keep the input verbatim.
        return string
    return "\\frac{" + str(a) + "}{" + str(b) + "}"
def remove_right_units(string):
    r"""Drop a trailing unit annotation introduced by ``\text{ ``.

    Returns:
        The part of *string* before ``\text{ ``, or *string* unchanged when
        that marker is absent.

    Raises:
        AssertionError: If the marker occurs more than once. Raised
            explicitly so the check still runs under ``python -O``.
    """
    # "\\text{ " only ever occurs (at least in the val set) when describing units
    marker = "\\text{ "
    if marker not in string:
        return string
    splits = string.split(marker)
    if len(splits) != 2:
        raise AssertionError("expected exactly one '\\text{ ' in the string")
    return splits[0]
def fix_sqrt(string):
    r"""Brace the single-character argument of an unbraced ``\sqrt``.

    ``\sqrt3`` becomes ``\sqrt{3}``; occurrences already followed by ``{``
    are kept as they are, as is a ``\sqrt`` at the very end of the string
    with nothing after it.

    Returns:
        The rewritten string (or *string* itself when it has no ``\sqrt``).
    """
    if "\\sqrt" not in string:
        return string
    head, *tails = string.split("\\sqrt")
    rebuilt = [head]
    for tail in tails:
        if tail and tail[0] != "{":
            rebuilt.append("\\sqrt{" + tail[0] + "}" + tail[1:])
        else:
            # Already braced, or nothing follows (an empty tail used to
            # raise IndexError).
            rebuilt.append("\\sqrt" + tail)
    return "".join(rebuilt)
def strip_string(string):
    """Normalize a LaTeX answer string so equivalent answers compare equal.

    The steps run in a fixed order: textual clean-ups (line breaks, spacing
    commands, doubled backslashes, tfrac/dfrac, left/right, degree marks,
    escaped dollar and percent signs, trailing units), leading-zero repair,
    removal of a short "x = " prefix, sqrt bracing, space removal, frac
    bracing, the special case 0.5, and finally a/b to frac conversion.

    Returns:
        The normalized string. An input that becomes empty after the textual
        clean-ups is returned as the empty string.
    """
    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right)
    string = remove_right_units(string)

    # remove percentage
    # (a second replace of the invalid-escape literal "\%" used to follow; it
    # denotes the same two characters, so it was a no-op that only produced a
    # SyntaxWarning on newer Python versions)
    string = string.replace("\\%", "")

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    sides = string.split("=")
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = fix_a_slash_b(string)

    return string
lm_eval/tasks/ifeval/ifeval.yaml
View file @
60c9c170
...
@@ -12,7 +12,6 @@ generation_kwargs:
...
@@ -12,7 +12,6 @@ generation_kwargs:
temperature
:
0.0
temperature
:
0.0
max_gen_toks
:
1280
max_gen_toks
:
1280
process_results
:
!function
utils.process_results
process_results
:
!function
utils.process_results
num_fewshot
:
0
metric_list
:
metric_list
:
-
metric
:
prompt_level_strict_acc
-
metric
:
prompt_level_strict_acc
aggregation
:
mean
aggregation
:
mean
...
...
lm_eval/tasks/minerva_math/README.md
View file @
60c9c170
...
@@ -28,16 +28,11 @@ Eprint = {arXiv:2206.14858},
...
@@ -28,16 +28,11 @@ Eprint = {arXiv:2206.14858},
}
}
```
```
### Groups, Benchmarks and Tasks
### Groups and Tasks
#### Benchmarks
-
`minerva_math`
#### Groups
#### Groups
-
`math_word_problems`
-
`minerva_math`
-
`generate_until`
#### Tasks
#### Tasks
...
...
lm_eval/tasks/mmlu/_generate_configs.py
View file @
60c9c170
...
@@ -2,12 +2,14 @@
...
@@ -2,12 +2,14 @@
Take in a YAML, and output all "other" splits with this YAML
Take in a YAML, and output all "other" splits with this YAML
"""
"""
import
argparse
import
argparse
import
logging
import
os
import
os
import
yaml
import
yaml
from
tqdm
import
tqdm
from
tqdm
import
tqdm
from
lm_eval.logger
import
eval_logger
eval_logger
=
logging
.
getLogger
(
"lm-eval"
)
SUBJECTS
=
{
SUBJECTS
=
{
...
...
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
0 → 100644
View file @
60c9c170
dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
output_type: multiple_choice
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
doc_to_text: "Question: {{question.strip()}}\nAnswer:"
doc_to_choice: "{{choices}}"
doc_to_target: "{{answer}}"
metadata:
version: 0.0
lm_eval/tasks/mmlu/continuation/_mmlu.yaml
0 → 100644
View file @
60c9c170
group
:
mmlu_continuation
task
:
-
mmlu_continuation_stem
-
mmlu_continuation_other
-
mmlu_continuation_social_sciences
-
mmlu_continuation_humanities
lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
abstract_algebra"
"
description"
:
"
The
following
are
questions
(with
answers)
about
abstract
\
\
algebra.
\n\n
"
"
group"
:
"
mmlu_continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_abstract_algebra"
lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
anatomy"
"
description"
:
"
The
following
are
questions
(with
answers)
about
anatomy.
\n\
\n
"
"
group"
:
"
mmlu_continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_anatomy"
lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
astronomy"
"
description"
:
"
The
following
are
questions
(with
answers)
about
astronomy.
\n\
\n
"
"
group"
:
"
mmlu_continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_astronomy"
lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
business_ethics"
"
description"
:
"
The
following
are
questions
(with
answers)
about
business
\
\
ethics.
\n\n
"
"
group"
:
"
mmlu_continuation_other"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_business_ethics"
lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
clinical_knowledge"
"
description"
:
"
The
following
are
questions
(with
answers)
about
clinical
\
\
knowledge.
\n\n
"
"
group"
:
"
mmlu_continuation_other"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_clinical_knowledge"
lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
college_biology"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
biology.
\n\n
"
"
group"
:
"
mmlu_continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_college_biology"
lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
college_chemistry"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
chemistry.
\n\n
"
"
group"
:
"
mmlu_continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_college_chemistry"
lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml
0 → 100644
View file @
60c9c170
"
dataset_name"
:
"
college_computer_science"
"
description"
:
"
The
following
are
questions
(with
answers)
about
college
\
\
computer
science.
\n\n
"
"
group"
:
"
mmlu_continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_college_computer_science"
Prev
1
…
13
14
15
16
17
18
19
20
21
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment