gaoqiong / lm-evaluation-harness · Commits

Commit 5b159bf9, authored Jan 10, 2025 by Baber
Parent: 173b2bc3

    test humaneval

Showing 3 changed files with 28 additions and 24 deletions.
lm_eval/api/task.py                       +3   -3
lm_eval/tasks/humaneval/humaneval.yaml    +3   -2
lm_eval/tasks/humaneval/utils.py          +22  -19
lm_eval/api/task.py  (view file @ 5b159bf9)

@@ -1503,9 +1503,9 @@ class ConfigurableTask(Task):
             # we expect multiple_targets to be a list.
             elif self.multiple_target:
                 gold = list(gold)
-            elif (
-                type(gold) is not type(result)
-                and "bypass" not in self._metric_fn_list.keys()
+            # TODO: handle this better
+            elif type(gold) is not type(result) and not (
+                "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
             ):
                 # cast gold to the same type as result
                 gold = type(result)(gold)
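In plain terms, the old condition only skipped the type cast when a "bypass" metric was registered; the new condition also skips it when the model result is a list. A minimal, illustrative sketch of the Python behaviour the extra isinstance check guards against (the example values are made up; only gold = type(result)(gold) comes from the diff):

    gold = "42"                      # a string target, as doc_to_target often returns
    result = ["def add(a, b): ..."]  # a list of sampled generations (illustrative)

    # Without the isinstance(result, list) guard, the branch would reach
    # gold = type(result)(gold), i.e. list("42"), mangling the target:
    print(type(result)(gold))        # ['4', '2']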
lm_eval/tasks/humaneval/humaneval.yaml  (view file @ 5b159bf9)

@@ -3,11 +3,12 @@ dataset_path: openai/openai_humaneval
 output_type: generate_until
 test_split: test
 doc_to_text: "{{prompt}}"
-doc_to_target: !function utils.build_references
+doc_to_target: "{{test}}\ncheck({{entry_point}})"
 metric_list:
   - metric: !function utils.pass_at_1
     aggregation: mean
     higher_is_better: true
+    k: 64
 generation_kwargs:
   until:
     - "\nclass"
@@ -18,7 +19,7 @@ generation_kwargs:
   do_sample: true
   temperature: 0.2
   top_p: 0.95
-repeats: 64
+repeats: 2
 num_fewshot: 0
 filter_list:
   - name: "n=64" # number of samples to estimate pass@k
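Two things to note here: the new doc_to_target string inlines exactly what utils.build_references used to build (the test body followed by a check({{entry_point}}) call), and k: 64 is forwarded to the pass_at_1 metric while repeats is dropped to 2 for this test run. The pass@k numbers themselves come from the code_eval metric, which uses the standard unbiased estimator from the HumanEval paper; a small sketch of that estimator for intuition (a restatement, not lm-eval's own code):

    import numpy as np

    def pass_at_k_estimate(n: int, c: int, k: int) -> float:
        # Unbiased estimator 1 - C(n-c, k) / C(n, k): the probability that at
        # least one of k draws from n samples is correct, given c of the n pass.
        if n - c < k:
            return 1.0
        return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    # With 64 samples per problem (the "n=64" filter above) and, say, 16 passing:
    print(pass_at_k_estimate(n=64, c=16, k=1))  # ~0.25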
lm_eval/tasks/humaneval/utils.py  (view file @ 5b159bf9)

 import evaluate as hf_evaluate

-pass_at_k = hf_evaluate.load("code_eval")
-
-# run simple test to check code execution is enabled before model generation
-test_cases = ["assert add(2, 3)==5"]
-candidates = [["def add(a,b): return a*b"]]
-results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
-
-
-def pass_at_1(references, predictions):
-    return pass_at_k.compute(
+# pass_at_k = hf_evaluate.load("code_eval")
+#
+# # run simple test to check code execution is enabled before model generation
+# test_cases = ["assert add(2, 3)==5"]
+# candidates = [["def add(a,b): return a*b"]]
+# results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
+
+
+def pass_at_1(references: list[str], predictions: list[list[str]], k: list[int] = None):
+    pass_at_k = hf_evaluate.load("code_eval")
+    assert k is not None
+    if isinstance(k, int):
+        k = [k]
+    res = pass_at_k.compute(
         references=references,
         predictions=predictions,
-        k=[1],
-    )[0]["pass@1"]
-
-
-def build_references(doc):
-    return doc["test"] + "\n" + f"check({doc['entry_point']})"
-
-
-def build_predictions(resps, docs):
-    preds = []
-    for resp, doc in zip(resps, docs):
-        pred = [doc["prompt"] + r for r in resp]
-        preds.append(pred)
-    return preds
+        k=k,
+    )[0]
+    return {key: val for key, val in res.items() if key in map(lambda x: f"pass@{x}", k)}
+
+
+def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
+    return [[doc["prompt"] + r for r in resp] for resp, doc in zip(resps, docs)]
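For reference, a quick way to sanity-check the rewritten helpers end to end. This is an illustrative sketch, not part of the commit: the toy doc mimics HumanEval's prompt/test/entry_point fields, the import path assumes a local lm-evaluation-harness checkout, and HF_ALLOW_CODE_EVAL must be set because code_eval executes model-generated code.

    import os

    os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # opt in to executing generated code

    from lm_eval.tasks.humaneval.utils import build_predictions, pass_at_1

    # One toy problem shaped like a HumanEval doc (illustrative values).
    docs = [{
        "prompt": "def add(a, b):\n",
        "test": "def check(candidate):\n    assert candidate(2, 3) == 5\n",
        "entry_point": "add",
    }]
    resps = [["    return a + b"]]  # one sampled completion per doc

    # Same string the YAML's doc_to_target now renders: "{{test}}\ncheck({{entry_point}})"
    references = [doc["test"] + "\n" + f"check({doc['entry_point']})" for doc in docs]
    predictions = build_predictions(resps, docs)

    print(pass_at_1(references, predictions, k=1))  # expected: {'pass@1': 1.0}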