gaoqiong / lm-evaluation-harness · Commits · 94218002

Commit 94218002 authored Apr 26, 2022 by jon-tow

Add multi-reference ROUGE support

parent 4941a8bb
Showing 1 changed file with 61 additions and 0 deletions

lm_eval/metrics.py  +61  -0
import typing
import math
from collections.abc import Iterable

import numpy as np
import sacrebleu
from rouge_score import rouge_scorer
import sklearn.metrics
import random

...

@@ -184,6 +186,65 @@ def _sacreformat(refs, preds):
    return refs, preds
def rouge(
    refs: typing.List[str],
    pred: str,
    rouge_types: typing.List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"],
):
    """ROUGE with multi-reference support

    Implementation based on GEM-metrics:
    https://github.com/GEM-benchmark/GEM-metrics/blob/431a8174bd6b3637e8d6118bfad2983e39e99733/gem_metrics/rouge.py

    :param refs:
        A `list` of reference `str`s.
    :param pred:
        A single prediction `str`.
    """
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
    # ROUGE multi-ref jackknifing
    if len(refs) > 1:
        cur_scores = [scorer.score(ref, pred) for ref in refs]

        # get best score for all leave-one-out sets
        best_scores = []
        for leave in range(len(refs)):
            cur_scores_leave_one = [
                cur_scores[s] for s in range(len(refs)) if s != leave
            ]
            best_scores.append(
                {
                    rouge_type: max(
                        [s[rouge_type] for s in cur_scores_leave_one],
                        key=lambda s: s.fmeasure,
                    )
                    for rouge_type in rouge_types
                }
            )

        # average the leave-one-out bests to produce the final score
        score = {
            rouge_type: rouge_scorer.scoring.Score(
                np.mean([b[rouge_type].precision for b in best_scores]),
                np.mean([b[rouge_type].recall for b in best_scores]),
                np.mean([b[rouge_type].fmeasure for b in best_scores]),
            )
            for rouge_type in rouge_types
        }
    else:
        score = scorer.score(refs[0], pred)

    # convert the named tuples to plain nested dicts
    score = {
        rouge_type: {
            "precision": score[rouge_type].precision,
            "recall": score[rouge_type].recall,
            "fmeasure": score[rouge_type].fmeasure,
        }
        for rouge_type in rouge_types
    }
    return score


# stderr stuff
class _bootstrap_internal:
    ...

...
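A minimal usage sketch of the new metric (not part of the commit), assuming the rouge_score dependency is installed and that the function is imported as `from lm_eval.metrics import rouge`; the reference and prediction strings below are made up for illustration:

    from lm_eval.metrics import rouge  # assumed import path for lm_eval/metrics.py

    refs = [
        "the cat sat on the mat",
        "a cat was sitting on the mat",
    ]
    pred = "the cat is sitting on the mat"

    # Multi-reference case: for each leave-one-out subset of `refs`, the
    # best per-type score (by F-measure) is kept, and those bests are
    # averaged into the final precision/recall/fmeasure values.
    scores = rouge(refs, pred)
    print(scores["rouge1"]["fmeasure"])

    # Single-reference case: the underlying RougeScorer is applied directly.
    single = rouge([refs[0]], pred, rouge_types=["rougeL"])
    print(single["rougeL"])  # {'precision': ..., 'recall': ..., 'fmeasure': ...}

The jackknifing follows the GEM-metrics implementation linked in the docstring: with n references, each of the n leave-one-out subsets of size n-1 contributes its best-scoring reference per ROUGE type, and the final score is the average of those n bests.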