Commit 8806eff5 authored by haileyschoelkopf

support bleu score as a metric

parent dbf2c083
@@ -56,6 +56,24 @@ def matthews_corrcoef(items):
     return sklearn.metrics.matthews_corrcoef(golds, preds)
+
+@register_aggregation("bleu")
+def bleu(items):
+    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
+    for evaluating a generated sentence to a reference sentence. It counts matching
+    n-grams in the candidate translation to n-grams in the reference text, where
+    1-gram or unigram would be each token and a bigram comparison would be each
+    word pair. The comparison is made regardless of word order
+    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
+    Paper: https://www.aclweb.org/anthology/P02-1040/
+    Higher is better
+    """
+    refs = list(zip(*items))[0]
+    preds = list(zip(*items))[1]
+    refs, preds = _sacreformat(refs, preds)
+    return sacrebleu.corpus_bleu(preds, refs).score
+
 @register_metric(
     metric="acc",
     higher_is_better=True,
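The new aggregation unzips the accumulated (reference, prediction) pairs into parallel lists, reshapes them with the harness's `_sacreformat` helper, and hands them to sacrebleu's corpus-level BLEU. A minimal standalone sketch of that call, with illustrative data and the reference layout built by hand instead of via `_sacreformat`:

import sacrebleu

# Hypothetical example data; in the harness these come from the task's
# (reference, prediction) items.
preds = ["the cat sat on the mat", "hello world"]
# sacrebleu expects a list of reference streams: each inner list holds one
# reference per prediction, aligned by position.
refs = [["the cat is on the mat", "hello world"]]

# corpus_bleu returns a BLEUScore object; .score is the 0-100 BLEU value.
print(sacrebleu.corpus_bleu(preds, refs).score)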
@@ -160,6 +178,16 @@ def f1_fn(items):  # This is a passthrough function
     return items
+
+@register_metric(
+    metric="bleu",
+    higher_is_better=True,
+    output_type="greedy_until",
+    aggregation="bleu",
+)
+def bleu_fn(items):  # This is a passthrough function
+    return items
+
 @register_metric(
     metric="acc_all",
     higher_is_better=True,
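BLEU is registered here as a passthrough metric wired to the "bleu" aggregation rather than to "mean": the per-sample function only collects each (reference, prediction) pair, and the score is computed once over the whole corpus, since corpus BLEU is not the average of per-sentence scores. A hedged sketch of that pattern with simplified, illustrative names (not the harness's actual registry code):

import sacrebleu

def bleu_fn(item):
    # Per-sample "metric": just pass the (reference, prediction) pair through.
    return item

def bleu_aggregation(items):
    # Corpus-level aggregation: score all collected pairs in one call.
    refs, preds = zip(*items)
    return sacrebleu.corpus_bleu(list(preds), [list(refs)]).score

items = [
    bleu_fn(("the cat is on the mat", "the cat sat on the mat")),
    bleu_fn(("hello world", "hello world")),
]
print(bleu_aggregation(items))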
@@ -217,24 +245,6 @@ def weighted_mean(items):
     return sum(a) / sum(b)
-
-@register_metric(metric="bleu", higher_is_better=True, aggregation="mean")
-def bleu(items):
-    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
-    for evaluating a generated sentence to a reference sentence. It counts matching
-    n-grams in the candidate translation to n-grams in the reference text, where
-    1-gram or unigram would be each token and a bigram comparison would be each
-    word pair. The comparison is made regardless of word order
-    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
-    Paper: https://www.aclweb.org/anthology/P02-1040/
-    Higher is better
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_bleu(preds, refs).score
-
 @register_metric(metric="chrf", higher_is_better=True, aggregation="mean")
 def chrf(items):
     """chrF++ is a tool for automatic evaluation of machine translation output
@@ -999,11 +999,14 @@ class ConfigurableTask(Task):
         # TODO: this may break for multipLe_target, non zero-or-1 metrics
         scores = []
         for gold_option in gold:
-            res = self._metric_fn_list[key](
-                references=[gold_option],
-                predictions=[result],
-                **self._metric_fn_kwargs[key],
-            )
+            try:
+                res = self._metric_fn_list[key](
+                    references=[gold_option],
+                    predictions=[result],
+                    **self._metric_fn_kwargs[key],
+                )
+            except TypeError:  # TODO: this is hacky and I don't want to do it
+                result = self._metric_fn_list[key]([gold_option, result])
             if isinstance(res, dict):
                 # TODO: this handles the case where HF evaluate returns a dict.
                 res = res[key]
@@ -1013,12 +1016,14 @@
             else:
                 result = 0.0
         else:
-            result = self._metric_fn_list[key](
-                references=[gold],
-                predictions=[result],
-                **self._metric_fn_kwargs[key],
-            )
+            try:
+                result = self._metric_fn_list[key](
+                    references=[gold],
+                    predictions=[result],
+                    **self._metric_fn_kwargs[key],
+                )
+            except TypeError:
+                result = self._metric_fn_list[key]([gold, result])
         if isinstance(result, dict):
             result_dict.update(result)
         else:
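The try/except added in both branches bridges two calling conventions: HF evaluate-style metric functions take references=/predictions= keyword arguments, while the harness's own registered metrics take a single positional list, so a TypeError on the keyword call falls back to the list call. A hedged standalone sketch with made-up metric functions (not the harness's real ones):

def hf_style_exact_match(references, predictions, **kwargs):
    # keyword-argument convention, as used by HF evaluate metrics
    return float(references[0].strip() == predictions[0].strip())

def harness_style_exact_match(items):
    # positional-list convention, as used by the harness's registered metrics
    gold, pred = items
    return float(gold.strip() == pred.strip())

def score(metric_fn, gold, pred):
    try:
        return metric_fn(references=[gold], predictions=[pred])
    except TypeError:  # metric_fn does not accept the keyword interface
        return metric_fn([gold, pred])

print(score(hf_style_exact_match, "Paris", "Paris"))      # 1.0
print(score(harness_style_exact_match, "Paris", "Rome"))  # 0.0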
@@ -362,28 +362,35 @@ def evaluate(
         if type(items[0]) == tuple:
             numitem = len(items[0])
-        # distributed gather requires all ranks to have same dimensions
-        # so we pad out with float32 min value
-        pad_value = torch.finfo(torch.float32).min
-        metrics_tensor = torch.tensor(items, device=lm.device)
-        original_dtype = metrics_tensor.dtype  # store original dtype
-        torch_device_tensor = lm.accelerator.pad_across_processes(
-            metrics_tensor.to(torch.float32), pad_index=pad_value
-        )
-        gathered_item = lm.accelerator.gather(torch_device_tensor)
-        if numitem > 0:
-            gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
-        else:
-            gathered_filtered = gathered_item[gathered_item != pad_value]
-        gathered_item = (
-            gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
-        )
-        # reconvert if we were passed a tuple of values
-        if numitem > 0:
-            gathered_item = [tuple(g) for g in gathered_item]
+        if isinstance(items[0], (str, list)):
+            # handle the string case
+            gathered_items = [None] * lm.accelerator.num_processes
+            torch.distributed.all_gather_object(gathered_items, items)
+            gathered_item = list(itertools.chain.from_iterable(gathered_items))
+        else:
+            # distributed gather requires all ranks to have same dimensions
+            # so we pad out with float32 min value
+            pad_value = torch.finfo(torch.float32).min
+            metrics_tensor = torch.tensor(items, device=lm.device)
+            original_dtype = metrics_tensor.dtype  # store original dtype
+            torch_device_tensor = lm.accelerator.pad_across_processes(
+                metrics_tensor.to(torch.float32), pad_index=pad_value
+            )
+            gathered_item = lm.accelerator.gather(torch_device_tensor)
+            if numitem > 0:
+                gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+            else:
+                gathered_filtered = gathered_item[gathered_item != pad_value]
+            gathered_item = (
+                gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+            )
+            # reconvert if we were passed a tuple of values
+            if numitem > 0:
+                gathered_item = [tuple(g) for g in gathered_item]
         if lm.rank == 0:
             vals_torch[(task_name, key, metric)] = gathered_item
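The new branch exists because string predictions (as produced by greedy_until tasks and scored by BLEU) cannot be packed into a padded tensor, so they are gathered across ranks as Python objects with torch.distributed.all_gather_object and then flattened. A single-process illustration of the flattening step, with the distributed gather faked by hand:

import itertools

# Pretend these are the per-rank item lists that all_gather_object would
# collect into `gathered_items`; here two "ranks" are faked locally.
rank0_items = [("ref a", "pred a"), ("ref b", "pred b")]
rank1_items = [("ref c", "pred c")]
gathered_items = [rank0_items, rank1_items]

# Same flattening as in the diff: one combined list, in rank-major order.
gathered_item = list(itertools.chain.from_iterable(gathered_items))
print(gathered_item)
# [('ref a', 'pred a'), ('ref b', 'pred b'), ('ref c', 'pred c')]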
@@ -415,7 +422,7 @@ def evaluate(
         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
         # so we run them less iterations. still looking for a cleaner way to do this
-        if bootstrap_iters > 0:
+        if False:  # bootstrap_iters > 0:
             stderr = lm_eval.api.metrics.stderr_for_metric(
                 metric=task.aggregation()[metric],
                 bootstrap_iters=min(bootstrap_iters, 1000)
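Bootstrapped stderr is switched off here because each bootstrap iteration for a corpus-level metric like BLEU, chrF, or TER resamples the corpus and recomputes the full score, which is far more expensive than bootstrapping a mean of per-sample values. A hedged sketch of what that resampling loop looks like (illustrative only, not the harness's stderr_for_metric internals):

import random
import sacrebleu

def bootstrap_bleu_stderr(items, iters=100):
    # Each iteration resamples the corpus with replacement and recomputes
    # corpus BLEU from scratch, which is what makes this expensive.
    scores = []
    for _ in range(iters):
        sample = random.choices(items, k=len(items))
        refs, preds = zip(*sample)
        scores.append(sacrebleu.corpus_bleu(list(preds), [list(refs)]).score)
    mean = sum(scores) / len(scores)
    return (sum((s - mean) ** 2 for s in scores) / (len(scores) - 1)) ** 0.5

pairs = [("the cat is on the mat", "the cat sat on the mat")] * 50
print(bootstrap_bleu_stderr(pairs))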