gaoqiong / lm-evaluation-harness · Commits

Commit 8806eff5, authored Aug 11, 2023 by haileyschoelkopf
parent dbf2c083

support bleu score as a metric

Showing 3 changed files with 71 additions and 49 deletions:

  lm_eval/api/metrics.py   +28  -18
  lm_eval/api/task.py      +16  -11
  lm_eval/evaluator.py     +27  -20
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -56,6 +56,24 @@ def matthews_corrcoef(items):
     return sklearn.metrics.matthews_corrcoef(golds, preds)
 
 
+@register_aggregation("bleu")
+def bleu(items):
+    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
+    for evaluating a generated sentence to a reference sentence. It counts matching
+    n-grams in the candidate translation to n-grams in the reference text, where
+    1-gram or unigram would be each token and a bigram comparison would be each
+    word pair. The comparison is made regardless of word order
+    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
+    Paper: https://www.aclweb.org/anthology/P02-1040/
+    Higher is better
+    """
+    refs = list(zip(*items))[0]
+    preds = list(zip(*items))[1]
+    refs, preds = _sacreformat(refs, preds)
+    return sacrebleu.corpus_bleu(preds, refs).score
+
+
 @register_metric(
     metric="acc",
     higher_is_better=True,
@@ -160,6 +178,16 @@ def f1_fn(items):  # This is a passthrough function
     return items
 
 
+@register_metric(
+    metric="bleu",
+    higher_is_better=True,
+    output_type="greedy_until",
+    aggregation="bleu",
+)
+def bleu_fn(items):  # This is a passthrough function
+    return items
+
+
 @register_metric(
     metric="acc_all",
     higher_is_better=True,
@@ -217,24 +245,6 @@ def weighted_mean(items):
     return sum(a) / sum(b)
 
 
-@register_metric(metric="bleu", higher_is_better=True, aggregation="mean")
-def bleu(items):
-    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
-    for evaluating a generated sentence to a reference sentence. It counts matching
-    n-grams in the candidate translation to n-grams in the reference text, where
-    1-gram or unigram would be each token and a bigram comparison would be each
-    word pair. The comparison is made regardless of word order
-    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
-    Paper: https://www.aclweb.org/anthology/P02-1040/
-    Higher is better
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_bleu(preds, refs).score
-
-
 @register_metric(metric="chrf", higher_is_better=True, aggregation="mean")
 def chrf(items):
     """chrF++ is a tool for automatic evaluation of machine translation output
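
The net effect in this file is that BLEU moves from a plain per-item metric averaged with "mean" to a corpus-level aggregation: the registered "bleu" aggregation receives the task's collected (reference, prediction) pairs, unzips them, and hands the whole corpus to sacrebleu. A minimal sketch of the same computation outside the harness (the sentences and the hand-built reference shape below are illustrative, not part of the commit):

# A minimal sketch, assuming sacrebleu is installed; the example pairs are made up.
import sacrebleu

# The harness collects items as (reference, prediction) pairs per document.
items = [
    ("the cat sat on the mat", "the cat sat on a mat"),
    ("hello world", "hello there world"),
]

refs = list(zip(*items))[0]   # all references, in corpus order
preds = list(zip(*items))[1]  # all predictions, in corpus order

# sacrebleu.corpus_bleu takes a list of hypotheses and a list of reference
# streams; the harness builds that shape via _sacreformat, here we do it by hand.
score = sacrebleu.corpus_bleu(list(preds), [list(refs)]).score
print(f"corpus BLEU: {score:.2f}")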
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -999,11 +999,14 @@ class ConfigurableTask(Task):
             # TODO: this may break for multipLe_target, non zero-or-1 metrics
             scores = []
             for gold_option in gold:
-                res = self._metric_fn_list[key](
-                    references=[gold_option],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
+                try:
+                    res = self._metric_fn_list[key](
+                        references=[gold_option],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+                except TypeError:  # TODO: this is hacky and I don't want to do it
+                    result = self._metric_fn_list[key]([gold_option, result])
                 if isinstance(res, dict):
                     # TODO: this handles the case where HF evaluate returns a dict.
                     res = res[key]
@@ -1013,12 +1016,14 @@ class ConfigurableTask(Task):
                 else:
                     result = 0.0
             else:
-                result = self._metric_fn_list[key](
-                    references=[gold],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
+                try:
+                    result = self._metric_fn_list[key](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+                except TypeError:
+                    result = self._metric_fn_list[key]([gold, result])
                 if isinstance(result, dict):
                     result_dict.update(result)
                 else:
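
Both hunks wrap the metric call in try/except TypeError to bridge two calling conventions: HF-evaluate-style callables that take references=/predictions= keyword arguments, and harness-native passthrough metrics (like the new bleu_fn) that take a single [gold, prediction] item. A rough standalone sketch of that dispatch; both metric callables here are invented for illustration and are not harness code:

# A minimal sketch, not the harness implementation.

def hf_style_exact_match(references, predictions):
    # HF evaluate-style: keyword arguments, returns a dict keyed by metric name.
    return {"exact_match": float(references[0] == predictions[0])}

def harness_style_passthrough(pair):
    # Harness-native passthrough: takes a single [gold, prediction] item.
    return pair

def call_metric(metric_fn, gold, prediction, **kwargs):
    """Try the keyword-argument convention first; on TypeError fall back to
    the positional [gold, prediction] convention, mirroring the hunks above."""
    try:
        return metric_fn(references=[gold], predictions=[prediction], **kwargs)
    except TypeError:
        return metric_fn([gold, prediction])

print(call_metric(hf_style_exact_match, "a cat", "a cat"))     # {'exact_match': 1.0}
print(call_metric(harness_style_passthrough, "a cat", "a dog"))  # ['a cat', 'a dog']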
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -362,28 +362,35 @@ def evaluate(
                 if type(items[0]) == tuple:
                     numitem = len(items[0])
 
-                # distributed gather requires all ranks to have same dimensions
-                # so we pad out with float32 min value
-                pad_value = torch.finfo(torch.float32).min
-                metrics_tensor = torch.tensor(items, device=lm.device)
-
-                original_dtype = metrics_tensor.dtype  # store original dtype
-                torch_device_tensor = lm.accelerator.pad_across_processes(
-                    metrics_tensor.to(torch.float32), pad_index=pad_value
-                )
-                gathered_item = lm.accelerator.gather(torch_device_tensor)
-
-                if numitem > 0:
-                    gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
-                else:
-                    gathered_filtered = gathered_item[gathered_item != pad_value]
-
-                gathered_item = (
-                    gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
-                )
-                # reconvert if we were passed a tuple of values
-                if numitem > 0:
-                    gathered_item = [tuple(g) for g in gathered_item]
+                if isinstance(items[0], (str, list)):
+                    # handle the string case
+                    gathered_items = [None] * lm.accelerator.num_processes
+                    torch.distributed.all_gather_object(gathered_items, items)
+
+                    gathered_item = list(itertools.chain.from_iterable(gathered_items))
+                else:
+                    # distributed gather requires all ranks to have same dimensions
+                    # so we pad out with float32 min value
+                    pad_value = torch.finfo(torch.float32).min
+                    metrics_tensor = torch.tensor(items, device=lm.device)
+
+                    original_dtype = metrics_tensor.dtype  # store original dtype
+                    torch_device_tensor = lm.accelerator.pad_across_processes(
+                        metrics_tensor.to(torch.float32), pad_index=pad_value
+                    )
+                    gathered_item = lm.accelerator.gather(torch_device_tensor)
+
+                    if numitem > 0:
+                        gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
+                    else:
+                        gathered_filtered = gathered_item[gathered_item != pad_value]
+
+                    gathered_item = (
+                        gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
+                    )
+                    # reconvert if we were passed a tuple of values
+                    if numitem > 0:
+                        gathered_item = [tuple(g) for g in gathered_item]
 
                 if lm.rank == 0:
                     vals_torch[(task_name, key, metric)] = gathered_item
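
The new branch gathers non-numeric per-example results (for BLEU these are strings or lists, which cannot be packed into a padded float tensor) with torch.distributed.all_gather_object. A minimal standalone sketch of that gather pattern, assuming a process group is already initialized, e.g. via torchrun; the helper name and comments are illustrative:

# A minimal sketch, assuming torch.distributed is already initialized.
import itertools
import torch.distributed as dist

def gather_string_items(items):
    """Collect every rank's list of Python objects onto all ranks and flatten,
    mirroring the all_gather_object branch in evaluate()."""
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, items)  # each slot receives one rank's list
    return list(itertools.chain.from_iterable(gathered))

# e.g. rank 0 holds ["the cat sat", ...] and rank 1 holds ["hello world", ...];
# every rank ends up with the concatenation, and rank 0 records it in vals_torch.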
@@ -415,7 +422,7 @@ def evaluate(
             # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
             # so we run them less iterations. still looking for a cleaner way to do this
-            if bootstrap_iters > 0:
+            if False:  # bootstrap_iters > 0:
                 stderr = lm_eval.api.metrics.stderr_for_metric(
                     metric=task.aggregation()[metric],
                     bootstrap_iters=min(bootstrap_iters, 1000)
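
Bootstrapping a corpus-level metric like BLEU means re-running sacrebleu over a resampled corpus on every iteration, which is why the stderr computation is short-circuited here. A rough sketch of what such a bootstrap costs; this is not the harness's stderr_for_metric, and the helper name and iteration count are illustrative:

# A rough sketch of bootstrap stderr for corpus BLEU; iters=1000 is illustrative.
import random
import statistics
import sacrebleu

def bootstrap_bleu_stderr(pairs, iters=1000, seed=0):
    """pairs: list of (reference, prediction) tuples.
    Each iteration resamples the corpus with replacement and recomputes
    corpus BLEU, so cost grows linearly with iters; that recomputation is
    the expense the hotfix above avoids."""
    rng = random.Random(seed)
    scores = []
    for _ in range(iters):
        sample = [pairs[rng.randrange(len(pairs))] for _ in range(len(pairs))]
        refs = [p[0] for p in sample]
        preds = [p[1] for p in sample]
        scores.append(sacrebleu.corpus_bleu(preds, [refs]).score)
    return statistics.stdev(scores)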