Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
f5bdefe8
Commit
f5bdefe8
authored
Nov 01, 2023
by
lintangsutawika
Browse files
new way to display tasks
parent
a5d33ebe
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
47 additions
and
7 deletions
+47
-7
lm_eval/evaluator.py
lm_eval/evaluator.py
+47
-7
No files found.
lm_eval/evaluator.py
View file @
f5bdefe8
...
@@ -221,6 +221,7 @@ def evaluate(
...
@@ -221,6 +221,7 @@ def evaluate(
task_hierarchy
=
collections
.
defaultdict
(
list
)
task_hierarchy
=
collections
.
defaultdict
(
list
)
# store the ordering of tasks and groups
# store the ordering of tasks and groups
task_order
=
collections
.
defaultdict
(
int
)
task_order
=
collections
.
defaultdict
(
int
)
task_group_alias
=
collections
.
defaultdict
(
dict
)
# get lists of each type of request
# get lists of each type of request
for
task_name
,
task
in
task_dict
.
items
():
for
task_name
,
task
in
task_dict
.
items
():
...
@@ -228,6 +229,10 @@ def evaluate(
...
@@ -228,6 +229,10 @@ def evaluate(
group_name
,
task
=
task
group_name
,
task
=
task
task_hierarchy
[
group_name
].
append
(
task_name
)
task_hierarchy
[
group_name
].
append
(
task_name
)
versions
[
group_name
]
=
"N/A"
versions
[
group_name
]
=
"N/A"
if
"group_alias"
in
configs
[
task_name
]:
task_group_alias
[
group_name
]
=
configs
[
task_name
][
"group_alias"
]
else
:
else
:
task_hierarchy
[
task_name
]
=
[]
task_hierarchy
[
task_name
]
=
[]
...
@@ -237,6 +242,9 @@ def evaluate(
...
@@ -237,6 +242,9 @@ def evaluate(
versions
[
task_name
]
=
task
.
VERSION
versions
[
task_name
]
=
task
.
VERSION
configs
[
task_name
]
=
dict
(
task
.
dump_config
())
configs
[
task_name
]
=
dict
(
task
.
dump_config
())
if
"task_alias"
in
configs
[
task_name
]:
task_group_alias
[
task_name
]
=
configs
[
task_name
][
"task_alias"
]
if
limit
is
not
None
:
if
limit
is
not
None
:
if
task
.
has_test_docs
():
if
task
.
has_test_docs
():
task_docs
=
task
.
test_docs
()
task_docs
=
task
.
test_docs
()
...
@@ -522,19 +530,19 @@ def evaluate(
...
@@ -522,19 +530,19 @@ def evaluate(
results
[
group
][
"samples"
]
=
total_size
results
[
group
][
"samples"
]
=
total_size
def
print_tasks
(
task_hierarchy
,
task_order
,
task_version
):
def
print_tasks
(
task_hierarchy
,
task_order
,
task_version
,
task_group_alias
):
results_agg
=
collections
.
defaultdict
(
dict
)
results_agg
=
collections
.
defaultdict
(
dict
)
groups_agg
=
collections
.
defaultdict
(
dict
)
groups_agg
=
collections
.
defaultdict
(
dict
)
for
group_name
,
task_list
in
task_hierarchy
.
items
():
for
group_name
,
task_list
in
task_hierarchy
.
items
():
order
=
task_order
[
group_name
]
order
=
task_order
[
group_name
]
tabbed_name
=
"-"
*
order
+
group_name
results_agg
[
group_name
]
=
results
[
group_name
]
results_agg
[
tabbed_name
]
=
results
[
group_name
]
results_agg
[
group_name
][
"tab"
]
=
order
task_version
[
tabbed_name
]
=
task_version
[
group_name
]
if
(
order
<
max
(
task_order
.
values
()))
and
(
len
(
task_list
)
>
0
):
if
(
order
<
max
(
task_order
.
values
()))
and
(
len
(
task_list
)
>
0
):
groups_agg
[
tabbed_name
]
=
results
[
group_name
]
groups_agg
[
group_name
]
=
results
[
group_name
]
groups_agg
[
group_name
][
"tab"
]
=
order
if
task_list
!=
[]:
if
task_list
!=
[]:
for
task
in
sorted
(
task_list
):
for
task
in
sorted
(
task_list
):
...
@@ -544,7 +552,7 @@ def evaluate(
...
@@ -544,7 +552,7 @@ def evaluate(
_task_hierarchy
=
{
task
:
[]}
_task_hierarchy
=
{
task
:
[]}
_results_agg
,
_groups_agg
,
task_version
=
print_tasks
(
_results_agg
,
_groups_agg
,
task_version
=
print_tasks
(
_task_hierarchy
,
task_order
,
task_version
_task_hierarchy
,
task_order
,
task_version
,
task_group_alias
)
)
results_agg
=
{
**
results_agg
,
**
_results_agg
}
results_agg
=
{
**
results_agg
,
**
_results_agg
}
...
@@ -553,9 +561,41 @@ def evaluate(
...
@@ -553,9 +561,41 @@ def evaluate(
return
results_agg
,
groups_agg
,
task_version
return
results_agg
,
groups_agg
,
task_version
results_agg
,
groups_agg
,
versions
=
print_tasks
(
results_agg
,
groups_agg
,
versions
=
print_tasks
(
task_hierarchy
,
task_order
,
versions
task_hierarchy
,
task_order
,
versions
,
task_group_alias
)
)
_results_agg
=
collections
.
defaultdict
(
dict
)
_versions
=
collections
.
defaultdict
(
dict
)
for
task
in
results_agg
:
task_results
=
results_agg
[
task
]
if
"tab"
in
task_results
:
tab
=
task_results
.
pop
(
"tab"
)
tab_string
=
" "
*
(
tab
-
1
)
+
"-"
if
tab
>
0
else
""
if
task
in
task_group_alias
:
task_alias
=
task_group_alias
[
task
]
_results_agg
[
tab_string
+
task_alias
]
=
task_results
_versions
[
tab_string
+
task_alias
]
=
versions
[
task
]
else
:
_results_agg
[
tab_string
+
task
]
=
task_results
_versions
[
tab_string
+
task
]
=
versions
[
task
]
results_agg
=
_results_agg
versions
=
_versions
_groups_agg
=
collections
.
defaultdict
(
dict
)
for
group
in
groups_agg
:
group_results
=
groups_agg
[
group
]
if
"tab"
in
group_results
:
tab
=
group_results
.
pop
(
"tab"
)
tab_string
=
" "
*
(
tab
-
1
)
+
"-"
if
tab
>
0
else
""
if
group
in
task_group_alias
:
group_alias
=
task_group_alias
[
group
]
_groups_agg
[
tab_string
+
group_alias
]
=
group_results
else
:
_groups_agg
[
tab_string
+
group
]
=
group_results
groups_agg
=
_groups_agg
results_dict
=
{
results_dict
=
{
"results"
:
dict
(
results_agg
.
items
()),
"results"
:
dict
(
results_agg
.
items
()),
**
({
"groups"
:
dict
(
groups_agg
.
items
())}
if
bool
(
groups_agg
)
else
{}),
**
({
"groups"
:
dict
(
groups_agg
.
items
())}
if
bool
(
groups_agg
)
else
{}),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment