Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
52192906
"docs/source/en/api/outputs.md" did not exist on "856dad57bb7a9ee13af4a08492e524b0a145a2c5"
Commit
52192906
authored
Sep 16, 2025
by
Baber
Browse files
update mbpp
parent
13aa5096
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
46 deletions
+51
-46
lm_eval/tasks/mbpp/mbpp.yaml
lm_eval/tasks/mbpp/mbpp.yaml
+18
-7
lm_eval/tasks/mbpp/mbpp_evalplus.yaml
lm_eval/tasks/mbpp/mbpp_evalplus.yaml
+12
-11
lm_eval/tasks/mbpp/mbpp_instruct.yaml
lm_eval/tasks/mbpp/mbpp_instruct.yaml
+3
-2
lm_eval/tasks/mbpp/utils.py
lm_eval/tasks/mbpp/utils.py
+18
-26
No files found.
lm_eval/tasks/mbpp/mbpp.yaml
View file @
52192906
...
...
@@ -4,20 +4,31 @@ dataset_name: full
unsafe_code
:
true
output_type
:
generate_until
test_split
:
test
doc_to_text
:
"
You
are
an
expert
Python
programmer,
and
here
is
your
task:
{{text}}
Your
code
should
pass
these
tests:
\n\n
{{test_list[0]}}
\n
{{test_list[1]}}
\n
{{test_list[2]}}
\n
[BEGIN]
\n
"
doc_to_target
:
"
{%
if
is_fewshot
is
defined
%}{{code}}
\n
[DONE]{%
else
%}{{test_list[0]}}
\n
{{test_list[1]}}
\n
{{test_list[2]}}{%
endif
%}"
repeats
:
20
doc_to_text
:
"
{{text|trim}}
\n
{{code}}.split(':')[0]:
\n
"
doc_to_target
:
"
{%
if
is_fewshot
is
defined
%}{{code}}
\n
{%
else
%}{{test_list[0]}}
\n
{{test_list[1]}}
\n
{{test_list[2]}}{%
endif
%}"
target_delimiter
:
"
"
gen_prefix
:
"
Here
is
the
completed
function:
\n\n
```python
\n
"
metric_list
:
-
metric
:
!function
utils.pass_at_
1
-
metric
:
!function
utils.pass_at_
k
aggregation
:
mean
higher_is_better
:
true
k
:
[
10
]
generation_kwargs
:
until
:
-
"
[DONE]"
until
:
[
"
\n
class"
,
"
\n
assert"
,
'
\n"""'
,
"
\n
print"
,
"
\n
if"
,
"
\n
```"
,
"
\n
#"
,
"
\n
<|/"
,
"
<|eot_id|>"
,
]
do_sample
:
false
num_fewshot
:
3
fewshot_config
:
sampler
:
first_n
samples
:
!function
utils.list_fewshot_samples
metadata
:
version
:
1
.0
version
:
2
.0
lm_eval/tasks/mbpp/mbpp_evalplus.yaml
View file @
52192906
...
...
@@ -17,9 +17,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_l
target_delimiter
:
"
"
gen_prefix
:
"
Here
is
the
completed
function:
\n\n
```python
\n
"
metric_list
:
-
metric
:
!function
utils.pass_at_
10
-
metric
:
!function
utils.pass_at_
k
aggregation
:
mean
higher_is_better
:
true
k
:
[
10
]
filter_list
:
-
name
:
"
create_test"
filter
:
...
...
@@ -27,16 +28,16 @@ filter_list:
filter_fn
:
!function
utils.build_predictions
generation_kwargs
:
until
:
[
"
\n
class"
,
"
\n
assert"
,
'
\n"""'
,
"
\n
print"
,
"
\n
if"
,
"
\n
```"
,
"
\n
#"
,
"
\n
<|/"
,
"
<|eot_id|>"
,
]
"
\n
class"
,
"
\n
assert"
,
'
\n"""'
,
"
\n
print"
,
"
\n
if"
,
"
\n
```"
,
"
\n
#"
,
"
\n
<|/"
,
"
<|eot_id|>"
,
]
do_sample
:
true
temperature
:
0.8
top_p
:
0.95
...
...
lm_eval/tasks/mbpp/mbpp_instruct.yaml
View file @
52192906
...
...
@@ -9,9 +9,10 @@ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n```{% else %}{{test_list
gen_prefix
:
"
\n
```python
\n
"
target_delimiter
:
"
"
metric_list
:
-
metric
:
!function
utils.pass_at_
1
-
metric
:
!function
utils.pass_at_
k
aggregation
:
mean
higher_is_better
:
true
k
:
[
1
]
filter_list
:
-
name
:
"
extract_code"
filter
:
...
...
@@ -19,7 +20,7 @@ filter_list:
filter_fn
:
!function
utils.build_predictions
generation_kwargs
:
max_gen_toks
:
256
until
:
[]
until
:
[
]
do_sample
:
false
num_fewshot
:
3
fewshot_config
:
...
...
lm_eval/tasks/mbpp/utils.py
View file @
52192906
...
...
@@ -5,44 +5,36 @@ import evaluate as hf_evaluate
try
:
pass_at_k
=
hf_evaluate
.
load
(
"code_eval"
)
# run simple test to check code execution is enabled before model generation
compute_
=
hf_evaluate
.
load
(
"code_eval"
)
test_cases
=
[
"assert add(2, 3)==5"
]
candidates
=
[[
"def add(a,b): return a*b"
]]
results
=
pass_at_k
.
compute
(
references
=
test_cases
,
predictions
=
candidates
,
k
=
[
1
])
results
=
compute_
.
compute
(
references
=
test_cases
,
predictions
=
candidates
,
k
=
[
1
])
except
Exception
as
e
:
raise
e
def
pass_at_1
(
references
:
Union
[
str
,
list
[
str
]],
predictions
:
Union
[
str
,
list
[
list
[
str
]]]
)
->
float
:
if
isinstance
(
references
,
str
):
references
=
[
references
]
if
isinstance
(
predictions
[
0
],
str
):
predictions
=
[[
p
]
for
p
in
predictions
]
return
pass_at_k
.
compute
(
def
pass_at_k
(
references
:
list
[
str
],
predictions
:
list
[
list
[
str
]],
k
:
list
[
int
]
=
None
):
global
compute_
assert
k
is
not
None
if
isinstance
(
k
,
int
):
k
=
[
k
]
res
=
compute_
.
compute
(
references
=
references
,
predictions
=
predictions
,
k
=
[
1
],
)[
0
][
"pass@1"
]
def
pass_at_10
(
references
:
Union
[
str
,
list
[
str
]],
predictions
:
Union
[
str
,
list
[
list
[
str
]]]
)
->
float
:
global
pass_at_k
if
isinstance
(
references
,
str
):
references
=
[
references
]
if
isinstance
(
predictions
[
0
],
str
):
predictions
=
[[
p
]
for
p
in
predictions
]
res
=
pass_at_k
.
compute
(
references
=
references
,
predictions
=
predictions
,
k
=
[
10
],
num_workers
=
20
k
=
k
,
)
return
res
[
0
]
def
extract_python_block
(
text
:
str
)
->
str
:
if
not
text
.
startswith
(
"```"
):
text
=
"```python
\n
"
+
text
+
"
\n
```"
# capture only fences whose language tag is 'python'
pattern
=
re
.
compile
(
r
"```python\n([\s\S]*?)\n?```"
,
re
.
IGNORECASE
)
m
=
pattern
.
search
(
text
)
return
"from __future__ import annotations
\n
"
+
m
.
group
(
1
)
if
m
else
""
def
extract_code_blocks
(
text
:
str
)
->
str
:
# Pattern to match ```...``` blocks
ignore_annotations
=
"from __future__ import annotations
\n
"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment