Commit d4c00093 authored by cjlovering

Added default behavior for BLEU to the PromptSourceTask class

parent f39c27c2
@@ -14,6 +14,7 @@ from tqdm import tqdm
 import torch
 import torch.nn.functional as F
+from lm_eval import metrics
 from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
 from lm_eval import utils
 from abc import abstractmethod
@@ -637,6 +638,16 @@ class Task(abc.ABC):
 class PromptSourceTask(Task):
+    """These are the metrics from promptsource that we have
+    added default behavior for. If you want to add default behavior for a new metric,
+    update the functions below. If you want to use one of the following metrics,
+    *and* add additional custom processing, override `process_results`, `higher_is_better`, and `aggregation`.
+
+    WARNING: ROUGE is WIP.
+    """
+
+    CONFIGURED_PS_METRICS = set(["Accuracy", "BLEU", "ROUGE"])
+
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, prompt=None):
         super().__init__(data_dir, cache_dir, download_mode)
         self.prompt = prompt
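As the docstring says, tasks that want one of the configured metrics plus extra custom processing should override `process_results`, `higher_is_better`, and `aggregation`. The sketch below is a hypothetical subclass, not part of this commit: the class name and the `exact_match` key are illustrative, and it assumes the standard lm_eval Task hooks, i.e. a `process_results(self, doc, results)` signature, `doc_to_target(doc)`, and a generation-style prompt where `results[0]` is the model's text output.

from lm_eval.metrics import mean

class MyPromptedTask(PromptSourceTask):
    """Hypothetical subclass: keeps PromptSourceTask's default per-metric
    handling and layers one extra, task-specific key on top."""

    def process_results(self, doc, results):
        # Reuse the default behavior added in this commit.
        out = super().process_results(doc, results)
        # Illustrative extra metric; the key name and logic are assumptions.
        target = self.doc_to_target(doc).strip()
        out["exact_match"] = float(results[0].strip() == target)
        return out

    def higher_is_better(self):
        out = super().higher_is_better()
        out["exact_match"] = True
        return out

    def aggregation(self):
        out = super().aggregation()
        out["exact_match"] = mean
        return out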
@@ -737,29 +748,60 @@ class PromptSourceTask(Task):
             ), f"We expect this to be a ranked choice task; double check please."
             pred = answer_choices_list[np.argmax(results)]
             out = {}
-            if "Accuracy" in self.prompt.metadata.metrics:
-                out["acc"] = pred == target
+            for metric in self.prompt.metadata.metrics:
+                assert (
+                    metric in self.CONFIGURED_PS_METRICS
+                ), "Unexpected metric. Add it, or use a task-specific solution."
+                if metric == "Accuracy":
+                    out["acc"] = pred == target
             # TODO: Add metrics here.
             return out
         else:
-            raise NotImplementedError("Generation is not implemented yet.")
+            # NOTE: In the future, target may be a list, not a string.
+            pred = results[0].strip()
+            out = {}
+            for metric in self.prompt.metadata.metrics:
+                assert (
+                    metric in self.CONFIGURED_PS_METRICS
+                ), "Unexpected metric. Add it, or use a task-specific solution."
+                if metric == "BLEU":
+                    out["bleu"] = (target, pred)
+                if metric == "ROUGE":
+                    print("WARNING: Skipping Rouge.")
+            return out

         # Map metric name to HF metric.
         # TODO(Albert): What is Other?
         # metric_names = prompt.metadata.metrics

     def higher_is_better(self):
         out = {}
-        if "Accuracy" in self.prompt.metadata.metrics:
-            out["acc"] = True
+        for metric in self.prompt.metadata.metrics:
+            assert (
+                metric in self.CONFIGURED_PS_METRICS
+            ), "Unexpected metric. Add it, or use a task-specific solution."
+            if metric == "Accuracy":
+                out["acc"] = True
+            if metric == "BLEU":
+                out["bleu"] = True
+            if metric == "ROUGE":
+                print("WARNING: Skipping Rouge.")
         return out

     def aggregation(self):
         out = {}
-        if "Accuracy" in self.prompt.metadata.metrics:
-            out["acc"] = mean
+        for metric in self.prompt.metadata.metrics:
+            assert (
+                metric in self.CONFIGURED_PS_METRICS
+            ), "Unexpected metric. Add it, or use a task-specific solution."
+            if metric == "Accuracy":
+                out["acc"] = mean
+            if metric == "BLEU":
+                out["bleu"] = metrics.bleu
+            if metric == "ROUGE":
+                print("WARNING: Skipping Rouge.")
         return out
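Taken together, the three hooks define a per-example to corpus-level contract: `process_results` emits one dict per example (a boolean under "acc", a `(target, pred)` pair under "bleu"), `aggregation` maps each key to a function that reduces the collected list, and `higher_is_better` records the direction. Below is a minimal sketch of that reduction step only, assuming `metrics.bleu` accepts the list of `(target, pred)` pairs exactly as collected above; the example strings are made up and the loop is not the harness's real evaluator.

from lm_eval import metrics
from lm_eval.metrics import mean

# Per-example dicts as process_results would emit them (fabricated strings).
per_example = [
    {"acc": True, "bleu": ("the cat sat on the mat", "the cat sat on the mat")},
    {"acc": False, "bleu": ("a dog ran outside", "the dog ran outside")},
]

# What aggregation() returns for a prompt scored with Accuracy and BLEU.
aggregations = {"acc": mean, "bleu": metrics.bleu}

# Evaluator-side reduction: gather each key's values, then reduce them.
corpus_scores = {
    key: agg([example[key] for example in per_example])
    for key, agg in aggregations.items()
}
print(corpus_scores)  # e.g. {'acc': 0.5, 'bleu': <corpus BLEU score>}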
...
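For completeness, the `prompt` handed to `__init__` is a promptsource template, and `prompt.metadata.metrics` is the list of metric names the loops above gate against `CONFIGURED_PS_METRICS`. A hedged sketch of inspecting templates that way, assuming promptsource's `DatasetTemplates` loader and its `all_template_names` / `metadata.metrics` attributes; the dataset choice is illustrative.

from promptsource.templates import DatasetTemplates

CONFIGURED_PS_METRICS = {"Accuracy", "BLEU", "ROUGE"}

# Illustrative dataset/subset; any promptsource-covered dataset works the same way.
templates = DatasetTemplates("super_glue", "rte")

for name in templates.all_template_names:
    prompt = templates[name]
    for metric in prompt.metadata.metrics:
        status = ("has default behavior"
                  if metric in CONFIGURED_PS_METRICS
                  else "needs a task-specific solution")
        print(f"{name}: {metric} {status}")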