Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
31019847
Commit
31019847
authored
Mar 16, 2024
by
h-albert-lee
Browse files
Merge remote-tracking branch 'origin/main' into 1442-inverse-scaling-tasks
parents
10c9d5de
dc90fecc
Changes
52
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
297 additions
and
8 deletions
+297
-8
lm_eval/tasks/agieval/sat-en.yaml
lm_eval/tasks/agieval/sat-en.yaml
+7
-0
lm_eval/tasks/agieval/sat-math.yaml
lm_eval/tasks/agieval/sat-math.yaml
+7
-0
lm_eval/tasks/agieval/utils.py
lm_eval/tasks/agieval/utils.py
+274
-0
lm_eval/tasks/bigbench/generate_until_template_yaml
lm_eval/tasks/bigbench/generate_until_template_yaml
+1
-1
lm_eval/tasks/code_x_glue/code-text/go.yaml
lm_eval/tasks/code_x_glue/code-text/go.yaml
+1
-1
lm_eval/tasks/code_x_glue/code-text/java.yaml
lm_eval/tasks/code_x_glue/code-text/java.yaml
+1
-1
lm_eval/tasks/code_x_glue/code-text/javascript.yaml
lm_eval/tasks/code_x_glue/code-text/javascript.yaml
+1
-1
lm_eval/tasks/code_x_glue/code-text/php.yaml
lm_eval/tasks/code_x_glue/code-text/php.yaml
+1
-1
lm_eval/tasks/code_x_glue/code-text/python.yaml
lm_eval/tasks/code_x_glue/code-text/python.yaml
+1
-1
lm_eval/tasks/code_x_glue/code-text/ruby.yaml
lm_eval/tasks/code_x_glue/code-text/ruby.yaml
+1
-1
lm_eval/tasks/ifeval/ifeval.yaml
lm_eval/tasks/ifeval/ifeval.yaml
+1
-0
lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
...tasks/model_written_evals/advanced_ai_risk/_template_yaml
+1
-1
No files found.
lm_eval/tasks/agieval/sat-en.yaml
0 → 100644
View file @
31019847
include
:
aqua-rat.yaml
group
:
-
agieval
-
agieval_nous
-
agieval_en
task
:
agieval_sat_en
dataset_path
:
hails/agieval-sat-en
lm_eval/tasks/agieval/sat-math.yaml
0 → 100644
View file @
31019847
include
:
aqua-rat.yaml
group
:
-
agieval
-
agieval_nous
-
agieval_en
task
:
agieval_sat_math
dataset_path
:
hails/agieval-sat-math
lm_eval/tasks/agieval/utils.py
0 → 100644
View file @
31019847
# Answer parsing and normalization code, from
# https://github.com/ruixiangcui/AGIEval/blob/main/src/
# math_equivalence.py and post_process.py
import
re
from
typing
import
Dict
,
List
import
numpy
as
np
def
parse_math_answer
(
raw_string
):
def
remove_boxed
(
s
):
left
=
"
\\
boxed{"
try
:
assert
s
[:
len
(
left
)]
==
left
assert
s
[
-
1
]
==
"}"
answer
=
s
[
len
(
left
)
:
-
1
]
if
"="
in
answer
:
answer
=
answer
.
split
(
"="
)[
-
1
].
lstrip
(
" "
)
return
answer
except
Exception
:
return
None
def
last_boxed_only_string
(
string
):
idx
=
string
.
rfind
(
"
\\
boxed"
)
if
idx
<
0
:
idx
=
string
.
rfind
(
"
\\
fbox"
)
if
idx
<
0
:
return
None
i
=
idx
right_brace_idx
=
None
num_left_braces_open
=
0
while
i
<
len
(
string
):
if
string
[
i
]
==
"{"
:
num_left_braces_open
+=
1
if
string
[
i
]
==
"}"
:
num_left_braces_open
-=
1
if
num_left_braces_open
==
0
:
right_brace_idx
=
i
break
i
+=
1
if
right_brace_idx
is
None
:
retval
=
None
else
:
retval
=
string
[
idx
:
right_brace_idx
+
1
]
return
retval
def
get_answer_with_dollar_sign
(
s
):
first_pattern
=
"\$(.*)\$"
last_match
=
None
matches
=
re
.
findall
(
first_pattern
,
s
)
if
matches
:
last_match
=
matches
[
-
1
]
if
"="
in
last_match
:
last_match
=
last_match
.
split
(
"="
)[
-
1
].
lstrip
(
" "
)
return
last_match
def
get_answer_without_dollar_sign
(
s
):
last_match
=
None
if
"="
in
s
:
last_match
=
s
.
split
(
"="
)[
-
1
].
lstrip
(
" "
).
rstrip
(
"."
)
if
"
\\
n"
in
last_match
:
last_match
=
last_match
.
split
(
"
\\
n"
)[
0
]
else
:
pattern
=
"(?:
\\
$)?\d+(?:\.\d+)?(?![\w\d])"
matches
=
re
.
findall
(
pattern
,
s
)
if
matches
:
last_match
=
matches
[
-
1
]
return
last_match
if
"
\\
boxed"
in
raw_string
:
answer
=
remove_boxed
(
last_boxed_only_string
(
raw_string
))
else
:
answer
=
get_answer_with_dollar_sign
(
raw_string
)
if
not
answer
:
answer
=
get_answer_without_dollar_sign
(
raw_string
)
return
answer
# code from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py
def
_fix_fracs
(
string
):
substrs
=
string
.
split
(
"
\\
frac"
)
new_str
=
substrs
[
0
]
if
len
(
substrs
)
>
1
:
substrs
=
substrs
[
1
:]
for
substr
in
substrs
:
new_str
+=
"
\\
frac"
if
substr
[
0
]
==
"{"
:
new_str
+=
substr
else
:
try
:
assert
len
(
substr
)
>=
2
except
Exception
:
return
string
a
=
substr
[
0
]
b
=
substr
[
1
]
if
b
!=
"{"
:
if
len
(
substr
)
>
2
:
post_substr
=
substr
[
2
:]
new_str
+=
"{"
+
a
+
"}{"
+
b
+
"}"
+
post_substr
else
:
new_str
+=
"{"
+
a
+
"}{"
+
b
+
"}"
else
:
if
len
(
substr
)
>
2
:
post_substr
=
substr
[
2
:]
new_str
+=
"{"
+
a
+
"}"
+
b
+
post_substr
else
:
new_str
+=
"{"
+
a
+
"}"
+
b
string
=
new_str
return
string
def
_fix_a_slash_b
(
string
):
if
len
(
string
.
split
(
"/"
))
!=
2
:
return
string
a
=
string
.
split
(
"/"
)[
0
]
b
=
string
.
split
(
"/"
)[
1
]
try
:
a
=
int
(
a
)
b
=
int
(
b
)
assert
string
==
"{}/{}"
.
format
(
a
,
b
)
new_string
=
"
\\
frac{"
+
str
(
a
)
+
"}{"
+
str
(
b
)
+
"}"
return
new_string
except
Exception
:
return
string
def
_remove_right_units
(
string
):
# "\\text{ " only ever occurs (at least in the val set) when describing units
if
"
\\
text{ "
in
string
:
splits
=
string
.
split
(
"
\\
text{ "
)
assert
len
(
splits
)
==
2
return
splits
[
0
]
else
:
return
string
def
_fix_sqrt
(
string
):
if
"
\\
sqrt"
not
in
string
:
return
string
splits
=
string
.
split
(
"
\\
sqrt"
)
new_string
=
splits
[
0
]
for
split
in
splits
[
1
:]:
if
split
[
0
]
!=
"{"
:
a
=
split
[
0
]
new_substr
=
"
\\
sqrt{"
+
a
+
"}"
+
split
[
1
:]
else
:
new_substr
=
"
\\
sqrt"
+
split
new_string
+=
new_substr
return
new_string
def
_strip_string
(
string
):
# linebreaks
string
=
string
.
replace
(
"
\n
"
,
""
)
# print(string)
# remove inverse spaces
string
=
string
.
replace
(
"
\\
!"
,
""
)
# print(string)
# replace \\ with \
string
=
string
.
replace
(
"
\\\\
"
,
"
\\
"
)
# print(string)
# replace tfrac and dfrac with frac
string
=
string
.
replace
(
"tfrac"
,
"frac"
)
string
=
string
.
replace
(
"dfrac"
,
"frac"
)
# print(string)
# remove \left and \right
string
=
string
.
replace
(
"
\\
left"
,
""
)
string
=
string
.
replace
(
"
\\
right"
,
""
)
# print(string)
# Remove circ (degrees)
string
=
string
.
replace
(
"^{
\\
circ}"
,
""
)
string
=
string
.
replace
(
"^
\\
circ"
,
""
)
# remove dollar signs
string
=
string
.
replace
(
"
\\
$"
,
""
)
# remove units (on the right)
string
=
_remove_right_units
(
string
)
# remove percentage
string
=
string
.
replace
(
"
\\
%"
,
""
)
string
=
string
.
replace
(
"\%"
,
""
)
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string
=
string
.
replace
(
" ."
,
" 0."
)
string
=
string
.
replace
(
"{."
,
"{0."
)
# if empty, return empty string
if
len
(
string
)
==
0
:
return
string
if
string
[
0
]
==
"."
:
string
=
"0"
+
string
# to consider: get rid of e.g. "k = " or "q = " at beginning
if
len
(
string
.
split
(
"="
))
==
2
:
if
len
(
string
.
split
(
"="
)[
0
])
<=
2
:
string
=
string
.
split
(
"="
)[
1
]
# fix sqrt3 --> sqrt{3}
string
=
_fix_sqrt
(
string
)
# remove spaces
string
=
string
.
replace
(
" "
,
""
)
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string
=
_fix_fracs
(
string
)
# manually change 0.5 --> \frac{1}{2}
if
string
==
"0.5"
:
string
=
"
\\
frac{1}{2}"
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string
=
_fix_a_slash_b
(
string
)
return
string
def
is_equiv
(
str1
,
str2
,
verbose
=
False
):
if
str1
is
None
and
str2
is
None
:
print
(
"WARNING: Both None"
)
return
True
if
str1
is
None
or
str2
is
None
:
return
False
str1
,
str2
=
parse_math_answer
(
str1
),
parse_math_answer
(
str2
)
try
:
ss1
=
_strip_string
(
str1
)
ss2
=
_strip_string
(
str2
)
if
verbose
:
print
(
ss1
,
ss2
)
return
ss1
==
ss2
except
Exception
:
return
str1
==
str2
def
process_results
(
doc
:
dict
,
results
:
List
[
str
])
->
Dict
[
str
,
int
]:
candidate
=
results
[
0
]
gold
=
doc
[
"answer"
]
if
not
gold
:
print
(
doc
,
candidate
,
gold
)
if
is_equiv
(
candidate
,
gold
):
retval
=
1
else
:
retval
=
0
results
=
{
"acc"
:
retval
,
}
return
results
# use a custom process_results() function, because AGIEval can have multiple valid answers
def
process_results_mcqa
(
doc
,
results
):
results
=
[
result
[
0
]
for
result
in
results
]
gold
=
doc
[
"gold"
]
acc
=
1.0
if
int
(
np
.
argmax
(
results
))
in
gold
else
0.0
completion_len
=
np
.
array
([
float
(
len
(
i
))
for
i
in
doc
[
"choices"
]])
acc_norm
=
1.0
if
int
(
np
.
argmax
(
results
/
completion_len
))
in
gold
else
0.0
return
{
"acc"
:
acc
,
"acc_norm"
:
acc_norm
,
}
lm_eval/tasks/bigbench/generate_until_template_yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: default
doc_to_text: inputs
doc_to_target: "{{targets[0]}}"
generation_kwargs:
max_
l
en
gth
: 128
max_
g
en
_toks
: 128
metric_list:
- metric: exact_match
aggregation: mean
...
...
lm_eval/tasks/code_x_glue/code-text/go.yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: test
output_type
:
generate_until
generation_kwargs
:
num_beams
:
10
max_
l
en
gth
:
128
max_
g
en
_toks
:
128
until
:
-
"
</s>"
doc_to_text
:
!function
utils.doc_to_text
...
...
lm_eval/tasks/code_x_glue/code-text/java.yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: test
output_type
:
generate_until
generation_kwargs
:
num_beams
:
10
max_
l
en
gth
:
128
max_
g
en
_toks
:
128
until
:
-
"
</s>"
doc_to_text
:
!function
utils.doc_to_text
...
...
lm_eval/tasks/code_x_glue/code-text/javascript.yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: test
output_type
:
generate_until
generation_kwargs
:
num_beams
:
10
max_
l
en
gth
:
128
max_
g
en
_toks
:
128
until
:
-
"
</s>"
doc_to_text
:
!function
utils.doc_to_text
...
...
lm_eval/tasks/code_x_glue/code-text/php.yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: test
output_type
:
generate_until
generation_kwargs
:
num_beams
:
10
max_
l
en
gth
:
128
max_
g
en
_toks
:
128
until
:
-
"
</s>"
doc_to_text
:
!function
utils.doc_to_text
...
...
lm_eval/tasks/code_x_glue/code-text/python.yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: test
output_type
:
generate_until
generation_kwargs
:
num_beams
:
10
max_
l
en
gth
:
128
max_
g
en
_toks
:
128
until
:
-
"
</s>"
doc_to_text
:
!function
utils.doc_to_text
...
...
lm_eval/tasks/code_x_glue/code-text/ruby.yaml
View file @
31019847
...
...
@@ -8,7 +8,7 @@ test_split: test
output_type
:
generate_until
generation_kwargs
:
num_beams
:
10
max_
l
en
gth
:
128
max_
g
en
_toks
:
128
until
:
-
"
</s>"
doc_to_text
:
!function
utils.doc_to_text
...
...
lm_eval/tasks/ifeval/ifeval.yaml
View file @
31019847
...
...
@@ -12,6 +12,7 @@ generation_kwargs:
temperature
:
0.0
max_gen_toks
:
1280
process_results
:
!function
utils.process_results
num_fewshot
:
0
metric_list
:
-
metric
:
prompt_level_strict_acc
aggregation
:
mean
...
...
lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
View file @
31019847
...
...
@@ -3,7 +3,7 @@ dataset_path: EleutherAI/advanced_ai_risk
output_type: multiple_choice
validation_split: validation
target_delimiter: ""
doc_to_text: "Human: {question}\n\nAssistant:"
doc_to_text: "Human:
{
{question}
}
\n\nAssistant:"
doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
should_decontaminate: true
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment