gaoqiong / lm-evaluation-harness
Commit 18a7c8b1, authored Jul 17, 2023 by lintangsutawika

attach group identifier to task for aggregation

Parent: ef7588b6
Showing 2 changed files with 30 additions and 7 deletions (+30 -7).
lm_eval/evaluator.py         +25 -6
lm_eval/tasks/__init__.py    +5 -1
lm_eval/evaluator.py (view file @ 18a7c8b1)
@@ -191,10 +191,21 @@ def evaluate(
     samples = collections.defaultdict(list)
     requests = collections.defaultdict(list)
+    aggregate = collections.defaultdict(dict)
+    task_groups = collections.defaultdict(dict)
     padding_requests = collections.defaultdict(int)

     # get lists of each type of request
     for task_name, task in task_dict.items():
+        if type(task) == tuple:
+            group, task = task
+            # if group in task_groups:
+            #     task_groups[group].append(task_name)
+            # else:
+            #     task_groups[group] = [task_name]
+            task_groups[task_name] = group
+
         versions[task_name] = task.VERSION
         configs[task_name] = dict(task.dump_config())
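For context, a minimal sketch of the convention this hunk relies on: entries in task_dict may be either a plain task object or a (group_name, task) tuple, and task_groups records which group a task name belongs to. The TaskStub class and the example task/group names below are hypothetical placeholders, not part of the harness.

import collections

# Hypothetical stand-in for a harness task object (not the real Task class).
class TaskStub:
    VERSION = 0
    def dump_config(self):
        return {"num_fewshot": 0}

# task_dict entries: plain task, or (group, task) tuple.
task_dict = {
    "arc_easy": ("ai2_arc", TaskStub()),
    "lambada": TaskStub(),
}

task_groups = collections.defaultdict(dict)
for task_name, task in task_dict.items():
    if type(task) == tuple:
        group, task = task
        task_groups[task_name] = group

print(dict(task_groups))  # {'arc_easy': 'ai2_arc'}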
@@ -269,6 +280,8 @@ def evaluate(
     ### Postprocess outputs ###
     # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
     for task_name, task in task_dict.items():
+        if type(task) == tuple:
+            group, task = task
         task.apply_filters()

     ### Collect values of metrics on all datapoints ###
@@ -276,6 +289,8 @@ def evaluate(
     # unpack results and sort back in order and return control to Task
     for task_name, task in task_dict.items():
+        if type(task) == tuple:
+            group, task = task
         # TODO: make it possible to use a different metric per filter
         # iterate over different filters used
         for key in task.instances[0].filtered_resps.keys():
@@ -361,6 +376,8 @@ def evaluate(
     # aggregate results ; run bootstrap CIs
     for (task_name, key, metric), items in vals.items():
         task = task_dict[task_name]
+        if type(task) == tuple:
+            group, task = task
         task_score = task.aggregation()[metric](items)
         results[task_name][metric + "," + key] = task_score
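As a reminder of what task.aggregation()[metric](items) computes here: aggregation() maps each metric name to a function that reduces the per-sample values to a single task score. A toy illustration with a hypothetical metric table (mean for accuracy), not a real task object:

# Hypothetical aggregation table; real tasks build this from their config.
def mean(items):
    return sum(items) / len(items)

aggregations = {"acc": mean}

items = [1, 0, 1, 1]                      # per-sample accuracy values
task_score = aggregations["acc"](items)   # 0.75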
@@ -373,10 +390,11 @@ def evaluate(
         # | word_perplexity
         # | byte_perplexity
         # | bits_per_byte
-        if metric not in aggregate:
-            aggregate[metric] = [task_score]
+        group_name = task_groups[task_name]
+        if metric not in aggregate[group_name]:
+            aggregate[group_name][metric] = [task_score]
         else:
-            aggregate[metric].append(task_score)
+            aggregate[group_name][metric].append(task_score)

         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
         # so we run them less iterations. still looking for a cleaner way to do this
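The net effect of this hunk is that task scores are collected per group instead of into a single flat bucket. A standalone sketch of the accumulation, using hypothetical task names and scores:

import collections

aggregate = collections.defaultdict(dict)
task_groups = {"arc_easy": "ai2_arc", "arc_challenge": "ai2_arc"}

# Hypothetical per-task scores for one metric.
scores = {"arc_easy": 0.72, "arc_challenge": 0.41}

metric = "acc"
for task_name, task_score in scores.items():
    group_name = task_groups[task_name]
    if metric not in aggregate[group_name]:
        aggregate[group_name][metric] = [task_score]
    else:
        aggregate[group_name][metric].append(task_score)

# aggregate == {'ai2_arc': {'acc': [0.72, 0.41]}}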
@@ -391,9 +409,10 @@ def evaluate(
         if stderr is not None:
             results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-    for metric in aggregate.keys():
-        results["Aggregate"][metric] = np.average(aggregate[metric])
-    versions["Aggregate"] = "N/A"
+    for group in aggregate.keys():
+        for metric in aggregate[group].keys():
+            aggregate[group][metric] = np.average(aggregate[group][metric])
+        versions[group] = "N/A"

     results_dict = {
         "results": dict(results),
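The replacement loop takes an unweighted mean of the collected subtask scores per group (np.average without weights is a plain arithmetic mean). Continuing the hypothetical aggregate from the sketch above:

import numpy as np

aggregate = {"ai2_arc": {"acc": [0.72, 0.41]}}
versions = {}

for group in aggregate.keys():
    for metric in aggregate[group].keys():
        aggregate[group][metric] = np.average(aggregate[group][metric])
    versions[group] = "N/A"

# aggregate == {'ai2_arc': {'acc': 0.565}}, versions == {'ai2_arc': 'N/A'}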
lm_eval/tasks/__init__.py (view file @ 18a7c8b1)
@@ -128,11 +128,15 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
         if isinstance(task_element, str):
             if task_element in GROUP_REGISTRY:
+                group_name = task_element
                 for task_name in GROUP_REGISTRY[task_element]:
                     if task_name not in task_name_from_registry_dict:
                         task_name_from_registry_dict = {
                             **task_name_from_registry_dict,
-                            task_name: get_task(task_name=task_name, config=config),
+                            task_name: (
+                                group_name,
+                                get_task(task_name=task_name, config=config),
+                            ),
                         }
             else:
                 task_name = task_element
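With this change, asking get_task_dict for a registered group name yields a (group_name, task) tuple for each member task, which is what the evaluator-side unpacking above expects. A hedged usage sketch; "ai2_arc" is a placeholder for whatever name is actually present in GROUP_REGISTRY, and this assumes get_task_dict can be called with just a list of names:

from lm_eval.tasks import get_task_dict

# Placeholder group name; the real registry contents depend on the installed tasks.
task_dict = get_task_dict(["ai2_arc"])

for task_name, entry in task_dict.items():
    if type(entry) == tuple:
        group_name, task = entry
        print(task_name, "->", group_name)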