make chrf and ter aggregations

116c540a · haileyschoelkopf · 8806eff5 · 116c540a
Commit 116c540a authored Aug 12, 2023 by haileyschoelkopf
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 31 deletions

lm_eval/api/metrics.py lm_eval/api/metrics.py +51 -31

No files found.
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -74,6 +74,37 @@ def bleu(items):
    return sacrebleu.corpus_bleu(preds, refs).score


+@register_aggregation("chrf")
+def chrf(items):
+    """chrF++ is a tool for automatic evaluation of machine translation output
+    based on character n-gram precision and recall enhanced with word n-grams.
+    Source: https://github.com/m-popovic/chrF
+    Paper: https://www.aclweb.org/anthology/W15-3049.pdf
+
+    Higher is better.
+    """
+    refs = list(zip(*items))[0]  # each item is a (reference, prediction) pair
+    preds = list(zip(*items))[1]
+    refs, preds = _sacreformat(refs, preds)
+    return sacrebleu.corpus_chrf(preds, refs).score  # NOTE(review): called with defaults; confirm these give chrF++ (word n-grams) rather than plain chrF
+
+
+@register_aggregation("ter")
+def ter(items):
+    """Translation Error Rate is an error metric for machine translation that
+    measures the number of edits required to change a system output into one
+    of the references.
+    Source: http://www.cs.umd.edu/~snover/tercom/
+    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
+
+    Lower is better.
+    """
+    refs = list(zip(*items))[0]  # each item is a (reference, prediction) pair
+    preds = list(zip(*items))[1]
+    refs, preds = _sacreformat(refs, preds)
+    return sacrebleu.corpus_ter(preds, refs).score
+
+
 @register_metric(
    metric="acc",
    higher_is_better=True,
@@ -188,6 +219,26 @@ def bleu_fn(items):  # This is a passthrough function
    return items


+@register_metric(
+    metric="chrf",
+    higher_is_better=True,
+    output_type="greedy_until",
+    aggregation="chrf",
+)
+def chrf_fn(items):  # Passthrough: items are returned unchanged; the registration names "chrf" as the aggregation
+    return items
+
+
+@register_metric(
+    metric="ter",
+    higher_is_better=False,  # TER measures edits needed (an error rate), so lower is better
+    output_type="greedy_until",
+    aggregation="ter",
+)
+def ter_fn(items):  # Passthrough: items are returned unchanged; the registration names "ter" as the aggregation
+    return items
+
+
 @register_metric(
    metric="acc_all",
    higher_is_better=True,
@@ -245,37 +296,6 @@ def weighted_mean(items):
    return sum(a) / sum(b)


-@register_metric(metric="chrf", higher_is_better=True, aggregation="mean")
-def chrf(items):
-    """chrF++ is a tool for automatic evaluation of machine translation output
-    based on character n-gram precision and recall enhanced with word n-grams.
-    Source: https://github.com/m-popovic/chrF
-    Paper: https://www.aclweb.org/anthology/W15-3049.pdf
-
-    Higher is better  # TODO I think
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_chrf(preds, refs).score
-
-
-@register_metric(metric="ter", higher_is_better=True, aggregation="mean")
-def ter(items):
-    """Translation Error Rate is an error metric for machine translation that
-    measures the number of edits required to change a system output into one
-    of the references
-    Source: http://www.cs.umd.edu/~snover/tercom/
-    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
-
-    Lower is better
-    """
-    refs = list(zip(*items))[0]
-    preds = list(zip(*items))[1]
-    refs, preds = _sacreformat(refs, preds)
-    return sacrebleu.corpus_ter(preds, refs).score
-
-
 def is_non_str_iterable(obj):
    return isinstance(obj, Iterable) and not isinstance(obj, str)