gaoqiong / lm-evaluation-harness / Commits

Commit e0021a06
authored Jul 22, 2025 by Baber
remove doc_to_choice in generation process_results
parent 90d44580
Showing 1 changed file with 58 additions and 17 deletions

lm_eval/api/task.py  +58 -17
@@ -981,10 +981,6 @@ class ConfigurableTask(Task):
     def download(
         self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
     ) -> None:
-        from packaging.version import parse as vparse
-
-        if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
-            dataset_kwargs.pop("trust_remote_code", None)
         if isinstance(self.config.custom_dataset, Callable):
             eval_logger.warning(
                 f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
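For reference, a minimal standalone sketch (not part of the repository) of what the guard removed above did: when the installed datasets release was 4.0.0 or newer, any trust_remote_code entry in dataset_kwargs was dropped before the kwargs were forwarded to dataset loading. The dataset_kwargs dict below is made up for illustration.

# Hypothetical illustration of the removed guard; not repository code.
from packaging.version import parse as vparse

import datasets

dataset_kwargs = {"name": "main", "trust_remote_code": True}
if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
    # on datasets >= 4.0.0 the key is discarded rather than forwarded
    dataset_kwargs.pop("trust_remote_code", None)
print(dataset_kwargs)  # {'name': 'main'} when datasets >= 4.0.0 is installed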
@@ -1661,20 +1657,65 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "generate_until":
             gold = self.doc_to_target(doc)
             result = results[0]
-            if self.config.doc_to_choice is not None:
-                # If you set doc_to_choice,
-                # it assumes that doc_to_target returns a number.
-                choices = self.doc_to_choice(doc)
-                gold = choices[gold]
+            # we expect multiple_targets to be a list.
+            if self.multiple_target:
+                gold = list(gold)
+            # TODO: handle this better
+            elif type(gold) is not type(result) and not (
+                "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
+            ):
+                # cast gold to the same type as result
+                gold = type(result)(gold)

             for metric in self._metric_fn_list.keys():
-                try:
-                    result_score = self._metric_fn_list[metric](
-                        references=[gold] if not isinstance(gold, list) else gold,
-                        predictions=[result],
-                        **self._metric_fn_kwargs[metric],
-                    )
-                except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
-                    result_score = self._metric_fn_list[metric]([gold, result])
+                if self.multiple_target:
+                    # in the case where we have multiple targets,
+                    # return true if any are true
+                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
+                    scores = []
+                    if not isinstance(gold, list):
+                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
+                        # print(gold)
+                        gold = [gold]
+                    if metric == "exact_match":
+                        result = [result for _ in range(len(gold))]
+                        scores = self._metric_fn_list[metric](
+                            references=gold,
+                            predictions=result,
+                            **self._metric_fn_kwargs[metric],
+                        )[metric]
+                        result_score = 1.0 if scores > 0.0 else 0.0
+                    else:
+                        for gold_option in gold:
+                            try:
+                                result_score = self._metric_fn_list[metric](
+                                    references=[gold_option],
+                                    predictions=[result],
+                                    **self._metric_fn_kwargs[metric],
+                                )
+                            except (
+                                TypeError
+                            ):  # TODO: this is hacky and I don't want to do it
+                                result_score = self._metric_fn_list[metric](
+                                    [gold_option, result]
+                                )
+                            if isinstance(result_score, dict):
+                                # TODO: this handles the case where HF evaluate returns a dict.
+                                result_score = result_score[metric]
+                            scores.append(result_score)
+                        if any(scores):
+                            result_score = 1.0
+                        else:
+                            result_score = 0.0
+                else:
+                    try:
+                        result_score = self._metric_fn_list[metric](
+                            references=[gold],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[metric],
+                        )
+                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
+                        result_score = self._metric_fn_list[metric]([gold, result])
                 if isinstance(result_score, dict):
                     # TODO: this handles the case where HF evaluate returns a dict.
                     # This allows for multiple metrics to be returned from the same function
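To make the added behavior concrete, here is a minimal standalone sketch of how the new multiple-target branch scores a single generation: the prediction is broadcast to the length of the gold list, the metric is computed over the pairs, and the document counts as correct if any target matched. The exact_match function below is a hypothetical stand-in for an entry in self._metric_fn_list, not the repository's metric implementation.

# Hypothetical sketch of the multiple-target exact_match path.
def exact_match(references, predictions):
    # fraction of (reference, prediction) pairs that match exactly
    matches = [ref == pred for ref, pred in zip(references, predictions)]
    return {"exact_match": sum(matches) / len(matches)}

gold = ["Paris", "paris"]  # multiple acceptable targets for one doc
result = "Paris"           # single generation from the model

# broadcast the prediction, then count the doc as correct if any target matched
predictions = [result for _ in range(len(gold))]
score = exact_match(references=gold, predictions=predictions)["exact_match"]
result_score = 1.0 if score > 0.0 else 0.0
print(result_score)  # 1.0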