gaoqiong / lm-evaluation-harness · Commits

Commit 8806eff5, authored Aug 11, 2023 by haileyschoelkopf
parent dbf2c083

support bleu score as a metric

Showing 3 changed files with 71 additions and 49 deletions:

  lm_eval/api/metrics.py   +28  -18
  lm_eval/api/task.py      +16  -11
  lm_eval/evaluator.py     +27  -20
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -56,6 +56,24 @@ def matthews_corrcoef(items):
     return sklearn.metrics.matthews_corrcoef(golds, preds)
 
 
+@register_aggregation("bleu")
+def bleu(items):
+    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
+    for evaluating a generated sentence to a reference sentence. It counts matching
+    n-grams in the candidate translation to n-grams in the reference text, where
+    1-gram or unigram would be each token and a bigram comparison would be each
+    word pair. The comparison is made regardless of word order
+    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
+    Paper: https://www.aclweb.org/anthology/P02-1040/
+    Higher is better
+    """
+    refs = list(zip(*items))[0]
+    preds = list(zip(*items))[1]
+    refs, preds = _sacreformat(refs, preds)
+    return sacrebleu.corpus_bleu(preds, refs).score
+
+
 @register_metric(
     metric="acc",
     higher_is_better=True,
@@ -160,6 +178,16 @@ def f1_fn(items):  # This is a passthrough function
     return items
 
 
+@register_metric(
+    metric="bleu",
+    higher_is_better=True,
+    output_type="greedy_until",
+    aggregation="bleu",
+)
+def bleu_fn(items):  # This is a passthrough function
+    return items
+
+
 @register_metric(
     metric="acc_all",
     higher_is_better=True,
@@ -217,24 +245,6 @@ def weighted_mean(items):
     return sum(a) / sum(b)
 
 
-@register_metric(metric="bleu", higher_is_better=True, aggregation="mean")
-def bleu(items):
-    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
-    for evaluating a generated sentence to a reference sentence. It counts matching
-    n-grams in the candidate translation to n-grams in the reference text, where
-    1-gram or unigram would be each token and a bigram comparison would be each
-    word pair. The comparison is made regardless of word order
-    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
-    Paper: https://www.aclweb.org/anthology/P02-1040/
-    Higher is better
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_bleu(preds, refs).score
-
-
 @register_metric(metric="chrf", higher_is_better=True, aggregation="mean")
 def chrf(items):
     """chrF++ is a tool for automatic evaluation of machine translation output
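
The net effect in this file is that BLEU moves from a plain per-item metric averaged with "mean" to a corpus-level aggregation: the registered "bleu" aggregation receives the task's collected (reference, prediction) pairs, unzips them, and hands the whole corpus to sacrebleu. A minimal sketch of the same computation outside the harness (the sentences and the hand-built reference shape below are illustrative, not part of the commit):

# A minimal sketch, assuming sacrebleu is installed; the example pairs are made up.
import sacrebleu

# The harness collects items as (reference, prediction) pairs per document.
items = [
    ("the cat sat on the mat", "the cat sat on a mat"),
    ("hello world", "hello there world"),
]

refs = list(zip(*items))[0]   # all references, in corpus order
preds = list(zip(*items))[1]  # all predictions, in corpus order

# sacrebleu.corpus_bleu takes a list of hypotheses and a list of reference
# streams; the harness builds that shape via _sacreformat, here we do it by hand.
score = sacrebleu.corpus_bleu(list(preds), [list(refs)]).score
print(f"corpus BLEU: {score:.2f}")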
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -999,11 +999,14 @@ class ConfigurableTask(Task):
             # TODO: this may break for multipLe_target, non zero-or-1 metrics
             scores = []
             for gold_option in gold:
-                res = self._metric_fn_list[key](
-                    references=[gold_option],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
+                try:
+                    res = self._metric_fn_list[key](
+                        references=[gold_option],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+                except TypeError:  # TODO: this is hacky and I don't want to do it
+                    result = self._metric_fn_list[key]([gold_option, result])
                 if isinstance(res, dict):
                     # TODO: this handles the case where HF evaluate returns a dict.
                     res = res[key]
@@ -1013,12 +1016,14 @@ class ConfigurableTask(Task):
                 else:
                     result = 0.0
             else:
-                result = self._metric_fn_list[key](
-                    references=[gold],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
+                try:
+                    result = self._metric_fn_list[key](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+                except TypeError:
+                    result = self._metric_fn_list[key]([gold, result])
                 if isinstance(result, dict):
                     result_dict.update(result)
                 else:
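
Both hunks wrap the metric call in try/except TypeError to bridge two calling conventions: HF-evaluate-style callables that take references=/predictions= keyword arguments, and harness-native passthrough metrics (like the new bleu_fn) that take a single [gold, prediction] item. A rough standalone sketch of that dispatch; both metric callables here are invented for illustration and are not harness code:

# A minimal sketch, not the harness implementation.

def hf_style_exact_match(references, predictions):
    # HF evaluate-style: keyword arguments, returns a dict keyed by metric name.
    return {"exact_match": float(references[0] == predictions[0])}

def harness_style_passthrough(pair):
    # Harness-native passthrough: takes a single [gold, prediction] item.
    return pair

def call_metric(metric_fn, gold, prediction, **kwargs):
    """Try the keyword-argument convention first; on TypeError fall back to
    the positional [gold, prediction] convention, mirroring the hunks above."""
    try:
        return metric_fn(references=[gold], predictions=[prediction], **kwargs)
    except TypeError:
        return metric_fn([gold, prediction])

print(call_metric(hf_style_exact_match, "a cat", "a cat"))     # {'exact_match': 1.0}
print(call_metric(harness_style_passthrough, "a cat", "a dog"))  # ['a cat', 'a dog']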
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -362,28 +362,35 @@ def evaluate(
                 if type(items[0]) == tuple:
                     numitem = len(items[0])
 
-                # distributed gather requires all ranks to have same dimensions
-                # so we pad out with float32 min value
-                pad_value = torch.finfo(torch.float32).min
-                metrics_tensor = torch.tensor(items, device=lm.device)
-
-                original_dtype = metrics_tensor.dtype  # store original dtype
-                torch_device_tensor = lm.accelerator.pad_across_processes(
-                    metrics_tensor.to(torch.float32), pad_index=pad_value
-                )
-                gathered_item = lm.accelerator.gather(torch_device_tensor)
-
-                if numitem > 0:
-                    gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
-                else:
-                    gathered_filtered = gathered_item[gathered_item != pad_value]
-
-                gathered_item = (
-                    gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
-                )
-                # reconvert if we were passed a tuple of values
-                if numitem > 0:
-                    gathered_item = [tuple(g) for g in gathered_item]
+                if isinstance(items[0], (str, list)):
+                    # handle the string case
+                    gathered_items = [None] * lm.accelerator.num_processes
+                    torch.distributed.all_gather_object(gathered_items, items)
+
+                    gathered_item = list(itertools.chain.from_iterable(gathered_items))
+                else:
+                    # distributed gather requires all ranks to have same dimensions
+                    # so we pad out with float32 min value
+                    pad_value = torch.finfo(torch.float32).min
+                    metrics_tensor = torch.tensor(items, device=lm.device)
+
+                    original_dtype = metrics_tensor.dtype  # store original dtype
+                    torch_device_tensor = lm.accelerator.pad_across_processes(
+                        metrics_tensor.to(torch.float32), pad_index=pad_value
+                    )
+                    gathered_item = lm.accelerator.gather(torch_device_tensor)
+
+                    if numitem > 0:
+                        gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+                    else:
+                        gathered_filtered = gathered_item[gathered_item != pad_value]
+
+                    gathered_item = (
+                        gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+                    )
+                    # reconvert if we were passed a tuple of values
+                    if numitem > 0:
+                        gathered_item = [tuple(g) for g in gathered_item]
 
                 if lm.rank == 0:
                     vals_torch[(task_name, key, metric)] = gathered_item
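
The new branch gathers non-numeric per-example results (for BLEU these are strings or lists, which cannot be packed into a padded float tensor) with torch.distributed.all_gather_object. A minimal standalone sketch of that gather pattern, assuming a process group is already initialized, e.g. via torchrun; the helper name and comments are illustrative:

# A minimal sketch, assuming torch.distributed is already initialized.
import itertools
import torch.distributed as dist

def gather_string_items(items):
    """Collect every rank's list of Python objects onto all ranks and flatten,
    mirroring the all_gather_object branch in evaluate()."""
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, items)  # each slot receives one rank's list
    return list(itertools.chain.from_iterable(gathered))

# e.g. rank 0 holds ["the cat sat", ...] and rank 1 holds ["hello world", ...];
# every rank ends up with the concatenation, and rank 0 records it in vals_torch.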
@@ -415,7 +422,7 @@ def evaluate(
             # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
             # so we run them less iterations. still looking for a cleaner way to do this
-            if bootstrap_iters > 0:
+            if False:  # bootstrap_iters > 0:
                 stderr = lm_eval.api.metrics.stderr_for_metric(
                     metric=task.aggregation()[metric],
                     bootstrap_iters=min(bootstrap_iters, 1000)
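
Bootstrapping a corpus-level metric like BLEU means re-running sacrebleu over a resampled corpus on every iteration, which is why the stderr computation is short-circuited here. A rough sketch of what such a bootstrap costs; this is not the harness's stderr_for_metric, and the helper name and iteration count are illustrative:

# A rough sketch of bootstrap stderr for corpus BLEU; iters=1000 is illustrative.
import random
import statistics
import sacrebleu

def bootstrap_bleu_stderr(pairs, iters=1000, seed=0):
    """pairs: list of (reference, prediction) tuples.
    Each iteration resamples the corpus with replacement and recomputes
    corpus BLEU, so cost grows linearly with iters; that recomputation is
    the expense the hotfix above avoids."""
    rng = random.Random(seed)
    scores = []
    for _ in range(iters):
        sample = [pairs[rng.randrange(len(pairs))] for _ in range(len(pairs))]
        refs = [p[0] for p in sample]
        preds = [p[1] for p in sample]
        scores.append(sacrebleu.corpus_bleu(preds, [refs]).score)
    return statistics.stdev(scores)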