Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
acf454b7
Commit
acf454b7
authored
May 14, 2025
by
Baber
Browse files
modify evaluator metrics to calculate each repeat
parent
28001d29
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
7 additions
and
5 deletions
+7
-5
lm_eval/api/task.py
lm_eval/api/task.py
+1
-1
lm_eval/evaluator.py
lm_eval/evaluator.py
+6
-4
No files found.
lm_eval/api/task.py
View file @
acf454b7
...
@@ -884,7 +884,7 @@ class ConfigurableTask(Task):
...
@@ -884,7 +884,7 @@ class ConfigurableTask(Task):
eval_logger
.
debug
(
eval_logger
.
debug
(
"No custom filters defined. Using default 'take_first' filter for handling repeats."
"No custom filters defined. Using default 'take_first' filter for handling repeats."
)
)
self
.
_filters
=
[
build_filter_ensemble
(
"none"
,
[[
"take_first"
,
None
]])]
#
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
if
self
.
config
.
use_prompt
is
not
None
:
if
self
.
config
.
use_prompt
is
not
None
:
eval_logger
.
info
(
f
"loading prompt
{
self
.
config
.
use_prompt
}
"
)
eval_logger
.
info
(
f
"loading prompt
{
self
.
config
.
use_prompt
}
"
)
...
...
lm_eval/evaluator.py
View file @
acf454b7
...
@@ -613,9 +613,11 @@ def evaluate(
...
@@ -613,9 +613,11 @@ def evaluate(
else
:
else
:
doc_id_true
=
doc_id
doc_id_true
=
doc_id
requests
=
instances_by_doc_id
[
doc_id
]
requests
=
instances_by_doc_id
[
doc_id
]
metrics
=
task
.
process_results
(
metrics
:
list
[
dict
]
=
[
doc
,
[
req
.
filtered_resps
[
filter_key
]
for
req
in
requests
]
task
.
process_results
(
doc
,
response
)
)
for
req
in
requests
for
response
in
req
.
filtered_resps
[
filter_key
]
]
if
log_samples
:
if
log_samples
:
target
=
task
.
doc_to_target
(
doc
)
target
=
task
.
doc_to_target
(
doc
)
example
=
{
example
=
{
...
@@ -628,7 +630,7 @@ def evaluate(
...
@@ -628,7 +630,7 @@ def evaluate(
req
.
filtered_resps
[
filter_key
]
for
req
in
requests
req
.
filtered_resps
[
filter_key
]
for
req
in
requests
],
],
"filter"
:
filter_key
,
"filter"
:
filter_key
,
"metrics"
:
list
(
m
et
rics
.
keys
()),
"metrics"
:
list
(
s
et
(
m
.
keys
()
for
m
in
metrics
)
),
"doc_hash"
:
hash_string
(
"doc_hash"
:
hash_string
(
json
.
dumps
(
json
.
dumps
(
requests
[
0
].
doc
,
requests
[
0
].
doc
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment