Commit 68a8790c authored by thefazzer

CommitmentBank framework refactor
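
This moves CommitmentBank from the old monolithic evaluate() loop onto the new request-based task interface (construct_requests / process_results / higher_is_better / aggregation), adds a fewshot_description and a doc_to_target, introduces the trueneitherfalse label helper in the shared common module, and shortens the task registry key from "commitmentbank" to "cb".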

parent e5d0229f
@@ -27,7 +27,7 @@ TASK_REGISTRY = {
     "wnli": glue.WNLI,
     # SuperGLUE
     "boolq": superglue.BoolQ,
-    "commitmentbank": superglue.CommitmentBank,
+    "cb": superglue.CommitmentBank,
     "copa": superglue.Copa,
     "multirc": superglue.MultiRC,
     "wic": superglue.WordsInContext,
...
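
After the rename, the task is looked up under the shorter key wherever TASK_REGISTRY is consulted. A sketch of the lookup (the lm_eval.tasks module path is an assumption, not shown in this diff):

    import lm_eval.tasks as tasks

    task_class = tasks.TASK_REGISTRY["cb"]   # was "commitmentbank" before this commit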
@@ -56,3 +56,11 @@ def yesno(x):
         return 'yes'
     else:
         return 'no'
+
+def trueneitherfalse(x):
+    if x == 0:
+        return 'true'
+    elif x == 1:
+        return 'neither'
+    elif x == 2:
+        return 'false'
\ No newline at end of file
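
The helper mirrors the {0: "true", 1: "neither", 2: "false"} mapping already used in CommitmentBank.doc_to_text. A quick standalone sanity check (not part of the commit; the import path is assumed from the relative import in the next file):

    from lm_eval.tasks.common import trueneitherfalse

    assert trueneitherfalse(0) == 'true'
    assert trueneitherfalse(1) == 'neither'
    assert trueneitherfalse(2) == 'false'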
@@ -2,7 +2,7 @@
 import numpy as np
 from tqdm import auto as tqdm_lib

-from . common import HFTask, simple_accuracy_metric, yesno
+from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse
 from lm_eval.base import rf, mean

 class BoolQ(HFTask):
@@ -69,6 +69,9 @@ class CommitmentBank(HFTask):
     def has_test_docs(self):
         return True

+    def fewshot_description(self):
+        return "Given a premise and a hypothesis, classify whether the author of the premise is committed to the truth of the hypothesis. The three possible labels are true, false or neither."
+
     def doc_to_text(self, doc, include_target=True):
         text = "{}\nquestion:\t{}\ttrue, false or neither?\nanswer:".format(
             doc["premise"],
@@ -80,29 +83,34 @@ class CommitmentBank(HFTask):
             # Neither = neutral
             text += " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])
         return text

-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        golds = [doc["label"] for doc in docs]
-        preds = []
-        for doc in tqdm_lib.tqdm(docs):
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            probs = np.array([
-                lm.loglikelihood(ctx, ' true'),
-                lm.loglikelihood(ctx, ' neither'),
-                lm.loglikelihood(ctx, ' false'),
-            ])
-            preds.append(np.argmax(probs))
-        return simple_accuracy_metric(preds=preds, golds=golds)
+
+    def doc_to_target(self, doc):
+        return trueneitherfalse(doc['label'])
+
+    def construct_requests(self, doc, ctx):
+        ll_true, _ = rf.loglikelihood(ctx, ' true')
+        ll_neither, _ = rf.loglikelihood(ctx, ' neither')
+        ll_false, _ = rf.loglikelihood(ctx, ' false')
+        return ll_true, ll_neither, ll_false
+
+    def process_results(self, doc, results):
+        gold = doc["label"]
+        acc = 1. if np.argmax(results) == gold else 0.
+        return {
+            "acc": acc
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": mean
+        }


 class Copa(HFTask):
     DATASET_PATH = "super_glue"
...
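
Taken together, the refactor splits evaluation into two phases: construct_requests emits one loglikelihood request per candidate label, and process_results later receives the scored loglikelihoods in the same (true, neither, false) order, so np.argmax lines up directly with the dataset's 0/1/2 label encoding. A minimal sketch of that contract, with made-up scores (values hypothetical, not from this commit):

    import numpy as np

    # Suppose the LM scored the three continuations for a doc whose gold label is 1 ("neither").
    results = [-4.2, -1.3, -5.0]   # (ll_true, ll_neither, ll_false)
    doc = {"label": 1}

    acc = 1. if np.argmax(results) == doc["label"] else 0.
    print(acc)   # 1.0 -- tuple order matches the label encoding

Per-document "acc" values are then reduced with the mean aggregator declared in aggregation(), and higher_is_better() marks accuracy as a metric to maximize.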