gaoqiong / lm-evaluation-harness

Commit 3001569b
Authored Jul 17, 2023 by lintangsutawika
Parent: 53754d41

prototype for aggregate
Showing 1 changed file with 12 additions and 4 deletions.

lm_eval/evaluator.py (+12, -4)
@@ -190,7 +190,7 @@ def evaluate(
     configs = collections.defaultdict(dict)
     samples = collections.defaultdict(list)
     requests = collections.defaultdict(list)
+    aggregate = collections.defaultdict(dict)
     padding_requests = collections.defaultdict(int)

     # get lists of each type of request

@@ -356,14 +356,18 @@ def evaluate(
         vals = vals_torch

+    # Add Aggregation Here
     if lm.rank == 0:
         ### Aggregate results over all datapoints ###
         # aggregate results ; run bootstrap CIs
         for (task_name, key, metric), items in vals.items():
             task = task_dict[task_name]
-            results[task_name][metric + "," + key] = task.aggregation()[metric](items)
+            task_score = task.aggregation()[metric](items)
+            results[task_name][metric + "," + key] = task_score
+
+            if metric not in aggregate:
+                aggregate[metric] = [task_score]
+            else:
+                aggregate[metric].append(task_score)

             # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
             # so we run them less iterations. still looking for a cleaner way to do this

@@ -378,8 +382,12 @@ def evaluate(
             if stderr is not None:
                 results[task_name][metric + "_stderr" + "," + key] = stderr(items)

+        for metric in aggregate.keys():
+            aggregate[metric] = np.average(aggregate[metric])
+
         results_dict = {
             "results": dict(results),
+            "aggregate": dict(aggregate),
             "configs": dict(configs),
             "versions": dict(versions),
         }
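Taken together, the three hunks bolt a cross-task roll-up onto evaluate(): besides the existing per-task entry in results, each task's score is also collected into aggregate[metric], and once every task is scored each metric's list is collapsed with np.average and reported under a new "aggregate" key in results_dict. Below is a minimal, self-contained sketch of that collect-then-average pattern; the task names and scores are made up for illustration and are not from the commit.

import collections

import numpy as np

# Hypothetical per-task scores, keyed (task_name, metric) -> score.
task_scores = {
    ("arc_easy", "acc"): 0.70,
    ("hellaswag", "acc"): 0.55,
    ("lambada", "perplexity"): 3.2,
}

# Same container the commit introduces at the top of evaluate().
aggregate = collections.defaultdict(dict)

# Mirror of the loop body added in the second hunk: group scores by metric name.
for (_task, metric), task_score in task_scores.items():
    if metric not in aggregate:
        aggregate[metric] = [task_score]
    else:
        aggregate[metric].append(task_score)

# Mirror of the third hunk: reduce each metric's list to an unweighted mean.
for metric in aggregate.keys():
    aggregate[metric] = np.average(aggregate[metric])

print(dict(aggregate))  # acc -> 0.625, perplexity -> 3.2

Note the reduction is an unweighted mean over tasks: each task counts equally regardless of how many documents it contains, and scores on different scales (accuracy vs. perplexity) are never mixed, because grouping is by metric name.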