Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
a005aeba
Unverified
Commit
a005aeba
authored
Aug 24, 2023
by
Lintang Sutawika
Committed by
GitHub
Aug 24, 2023
Browse files
Merge pull request #802 from EleutherAI/fix-metrics
Merge Fix metrics branch
parents
c01d5bac
f5d0f8e6
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
9 additions
and
9 deletions
+9
-9
ignore.txt
ignore.txt
+1
-1
lm_eval/api/task.py
lm_eval/api/task.py
+3
-3
lm_eval/evaluator.py
lm_eval/evaluator.py
+5
-5
No files found.
ignore.txt
View file @
a005aeba
lm_eval/api/task.py
View file @
a005aeba
...
@@ -659,14 +659,14 @@ class ConfigurableTask(Task):
...
@@ -659,14 +659,14 @@ class ConfigurableTask(Task):
self
.
multiple_target
=
len
(
test_target
)
self
.
multiple_target
=
len
(
test_target
)
else
:
else
:
if
(
type
(
test_target
)
is
int
)
and
(
test_choice
is
not
None
):
if
(
type
(
test_target
)
is
int
)
and
(
test_choice
is
not
None
):
test_target
=
[
self
.
doc_to_choice
(
test_target
)
[
test_target
]
]
test_target
=
test_choice
[
test_target
]
else
:
else
:
test_target
=
[
test_target
]
test_target
=
str
(
test_target
)
if
test_choice
is
not
None
:
if
test_choice
is
not
None
:
check_choices
=
test_choice
check_choices
=
test_choice
else
:
else
:
check_choices
=
test_target
check_choices
=
[
test_target
]
for
choice
in
check_choices
:
for
choice
in
check_choices
:
choice_has_whitespace
=
True
if
" "
in
choice
else
False
choice_has_whitespace
=
True
if
" "
in
choice
else
False
...
...
lm_eval/evaluator.py
View file @
a005aeba
...
@@ -219,7 +219,6 @@ def evaluate(
...
@@ -219,7 +219,6 @@ def evaluate(
padding_requests
=
collections
.
defaultdict
(
int
)
padding_requests
=
collections
.
defaultdict
(
int
)
# Stores group related keys and values for group-aggregation
# Stores group related keys and values for group-aggregation
aggregate
=
collections
.
defaultdict
(
dict
)
task_groups
=
collections
.
defaultdict
(
dict
)
task_groups
=
collections
.
defaultdict
(
dict
)
# get lists of each type of request
# get lists of each type of request
...
@@ -228,6 +227,7 @@ def evaluate(
...
@@ -228,6 +227,7 @@ def evaluate(
if
type
(
task
)
==
tuple
:
if
type
(
task
)
==
tuple
:
group
,
task
=
task
group
,
task
=
task
task_groups
[
task_name
]
=
group
task_groups
[
task_name
]
=
group
aggregate
[
task_name
]
=
{}
versions
[
task_name
]
=
task
.
VERSION
versions
[
task_name
]
=
task
.
VERSION
configs
[
task_name
]
=
dict
(
task
.
dump_config
())
configs
[
task_name
]
=
dict
(
task
.
dump_config
())
...
@@ -407,12 +407,12 @@ def evaluate(
...
@@ -407,12 +407,12 @@ def evaluate(
# | word_perplexity
# | word_perplexity
# | byte_perplexity
# | byte_perplexity
# | bits_per_byte
# | bits_per_byte
if
bool
(
task_groups
)
:
if
task_name
in
task_groups
:
group_name
=
task_groups
[
task_name
]
group_name
=
task_groups
[
task_name
]
if
metric
not
in
aggregate
[
group_name
]:
if
metric
in
list
(
aggregate
[
group_name
].
keys
()):
aggregate
[
group_name
][
metric
]
=
[
task_score
]
else
:
aggregate
[
group_name
][
metric
].
append
(
task_score
)
aggregate
[
group_name
][
metric
].
append
(
task_score
)
else
:
aggregate
[
group_name
][
metric
]
=
[
task_score
]
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
# so we run them less iterations. still looking for a cleaner way to do this
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment