Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
d4f62844
Commit
d4f62844
authored
Sep 13, 2023
by
lintangsutawika
Browse files
better presentation
parent
e1e05b19
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
42 deletions
+18
-42
lm_eval/evaluator.py
lm_eval/evaluator.py
+18
-42
No files found.
lm_eval/evaluator.py
View file @
d4f62844
...
@@ -443,46 +443,17 @@ def evaluate(
...
@@ -443,46 +443,17 @@ def evaluate(
if
stderr
is
not
None
:
if
stderr
is
not
None
:
results
[
task_name
][
metric
+
"_stderr"
+
","
+
key
]
=
stderr
(
items
)
results
[
task_name
][
metric
+
"_stderr"
+
","
+
key
]
=
stderr
(
items
)
# zero_order_groups = [group for group in task_hierarchy if task_hierarchy[group] == 0]
# for task_name, task in task_dict.items():
# if type(task) == tuple:
# group_name, _ = task
# else:
# group_name = None
# scores = results[task_name]
# if group_name is not None:
# group_name = tab_dict[group_name] * "-" + group_name
# if group_name not in results_agg:
# results_agg[group_name] = {}
# for metric in scores:
# if metric in results_agg[group_name]:
# results_agg[group_name][metric].append(scores[metric])
# else:
# results_agg[group_name][metric] = [scores[metric]]
# tab_task_name = tab_dict[task_name] * "-" + task_name
# results_agg[tab_task_name] = scores
# versions[tab_task_name] = versions[task_name]
# if bool(results_agg):
# for group in results_agg.keys():
# for metric in results_agg[group].keys():
# results_agg[group][metric] = np.average(results_agg[group][metric])
# versions[group] = "N/A"
if
bool
(
results
):
if
bool
(
results
):
for
task_or_group
in
results
.
keys
():
for
task_or_group
in
results
.
keys
():
for
metric
in
results
[
task_or_group
].
keys
():
for
metric
in
results
[
task_or_group
].
keys
():
try
:
print
(
task_or_group
,
metric
,
len
(
results
[
task_or_group
][
metric
]))
except
:
pass
if
type
(
results
[
task_or_group
][
metric
])
==
list
:
if
type
(
results
[
task_or_group
][
metric
])
==
list
:
results
[
task_or_group
][
metric
]
=
np
.
average
(
results
[
task_or_group
][
metric
])
results
[
task_or_group
][
metric
]
=
np
.
average
(
results
[
task_or_group
][
metric
])
versions
[
task_or_group
]
=
"N/A"
versions
[
task_or_group
]
=
"N/A"
print
(
"task_hierarchy"
)
print
(
task_hierarchy
)
print
(
"--"
)
for
group
in
task_hierarchy
.
keys
():
for
group
in
task_hierarchy
.
keys
():
if
group
not
in
task_order
:
if
group
not
in
task_order
:
task_order
[
group
]
=
0
task_order
[
group
]
=
0
...
@@ -493,15 +464,20 @@ def evaluate(
...
@@ -493,15 +464,20 @@ def evaluate(
else
:
else
:
task_order
[
task
]
=
1
+
task_order
[
group
]
task_order
[
task
]
=
1
+
task_order
[
group
]
print
(
"task_order"
)
for
task_name
,
task
in
task_dict
.
items
():
print
(
task_order
)
if
type
(
task
)
==
tuple
:
print
(
"--"
)
group_name
,
task
=
task
for
task_or_group
,
order
in
task_order
.
items
():
order
=
task_order
[
group_name
]
tabbed_name
=
">"
*
order
+
task_or_group
tabbed_name
=
"-"
*
order
+
group_name
results_agg
[
tabbed_name
]
=
results
[
task_or_group
]
results_agg
[
tabbed_name
]
=
results
[
group_name
]
versions
[
tabbed_name
]
=
versions
[
task_or_group
]
versions
[
tabbed_name
]
=
versions
[
group_name
]
if
(
order
==
0
)
and
len
(
task_hierarchy
[
task_or_group
])
>
0
:
if
order
==
0
:
groups_agg
[
task_or_group
]
=
results
[
task_or_group
]
groups_agg
[
group_name
]
=
results
[
group_name
]
order
=
task_order
[
task_name
]
tabbed_name
=
"-"
*
order
+
task_name
results_agg
[
tabbed_name
]
=
results
[
task_name
]
versions
[
tabbed_name
]
=
versions
[
task_name
]
results_dict
=
{
results_dict
=
{
"results"
:
dict
(
results_agg
.
items
()),
"results"
:
dict
(
results_agg
.
items
()),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment