Commit 36467c0e authored by Jonathan Tow

Adopt new framework for `glue`

parent 4c8d22db
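The change below replaces each GLUE task's monolithic evaluate() loop with four smaller hooks: construct_requests() declares which log-likelihoods the model should score for a document, process_results() turns the returned scores into per-document metric entries, and higher_is_better() / aggregation() tell the harness how to combine them. As a rough sketch of how an evaluator might drive these hooks (the evaluate_task driver and the score_requests callable below are illustrative assumptions, not part of this commit):

# Conceptual sketch only: `evaluate_task` and `score_requests` are assumed names,
# not this repository's evaluator; the hook names match the diff below.
def evaluate_task(task, score_requests, docs, num_fewshot=0):
    per_metric = {}
    for doc in docs:
        ctx = task.fewshot_context(
            doc=doc,
            provide_description=False,
            num_fewshot=num_fewshot,
        )
        requests = task.construct_requests(doc, ctx)
        results = score_requests(requests)  # e.g. the LM's loglikelihood for each request
        for name, value in task.process_results(doc, results).items():
            per_metric.setdefault(name, []).append(value)
    aggregators = task.aggregation()
    return {name: aggregators[name](values) for name, values in per_metric.items()}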
import abc
import random

import numpy as np
import sklearn


class LM(abc.ABC):
    @abc.abstractmethod
...@@ -177,15 +176,23 @@ class Dataset(abc.ABC):
        return description + labeled_examples + example


def mean(arr):
    return sum(arr) / len(arr)


def median(arr):
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)
    # sklearn.metrics.f1_score returns a scalar here, so return it directly;
    # max(fscore) was only valid when fscore was an array.
    return fscore
def acc_all(items):
...@@ -205,9 +212,6 @@ def acc_all(items):
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


req_ret_lens = {
    'loglikelihood': 2
}
...
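The metric helpers above are now aggregation functions over lists of (gold, pred) pairs rather than over pre-split arrays, which is the shape the per-task process_results() / aggregation() pairs below rely on. A quick sanity check with made-up data, assuming the package is importable as lm_eval and scikit-learn is installed:

# Made-up example: each item is a (gold, pred) pair, the shape process_results() emits.
from lm_eval.base import mean, f1_score, matthews_corrcoef  # same import glue.py uses below

items = [(1, True), (0, False), (1, True), (0, True)]
print(mean([g == p for g, p in items]))  # accuracy over the pairs -> 0.75
print(f1_score(items))                   # corpus-level binary F1 via sklearn
print(matthews_corrcoef(items))          # corpus-level Matthews correlation via sklearn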
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np

from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from scipy.stats import pearsonr, spearmanr
from tqdm import auto as tqdm_lib

from . common import HFTask, yesno

# Single-Sentence Tasks

class CoLA(HFTask):
...@@ -45,31 +32,80 @@ class CoLA(HFTask):
    def doc_to_target(self, doc):
        return " {}".format({1: "True", 0: "False"}[doc["label"]])
    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_false

    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
        return {
            "mcc": (gold, pred)
        }

    def higher_is_better(self):
        return {
            "mcc": True
        }

    def aggregation(self):
        return {
            "mcc": matthews_corrcoef
        }


class SST(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc):
        return "sentence:\t{}\t\nanswer:".format(
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_positive, _ = rf.loglikelihood(ctx, " Positive")
        ll_negative, _ = rf.loglikelihood(ctx, " Negative")
        return ll_positive, ll_negative

    def process_results(self, doc, results):
        ll_positive, ll_negative = results
        pred = ll_positive > ll_negative
        gold = doc["label"]
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }


# Inference Tasks
class MNLI(HFTask):
    DATASET_PATH = "glue"
...@@ -104,27 +140,28 @@ class MNLI(HFTask):
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }


class MNLIMismatched(MNLI):
...@@ -138,9 +175,9 @@ class MNLIMismatched(MNLI):
        return self.data["test_mismatched"]
class QNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"

    def has_training_docs(self):
        return True
...@@ -151,34 +188,88 @@ class QNLI(HFTask):
    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
            doc["question"],
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = not entailment
        return " {}".format({0: "Yes", 1: "No"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " Yes")
        ll_no, _ = rf.loglikelihood(ctx, " No")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        pred = ll_no > ll_yes
        gold = doc["label"]
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
class WNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
class RTE(HFTask):
...@@ -205,27 +296,36 @@ class RTE(HFTask):
        # 1 = not_entailment
        return " {}".format({0: "True", 1: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_false

    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_false > ll_true
        gold = doc["label"]
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
# Similarity and Paraphrase Tasks

class MRPC(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"

    def has_training_docs(self):
        return True
...@@ -236,33 +336,43 @@ class MRPC(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc):
        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        pred = ll_yes > ll_no
        return {
            "acc": pred == gold,
            "f1": (gold, pred),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    def aggregation(self):
        return {
            "acc": mean,
            "f1": f1_score
        }
class QQP(HFTask):
...@@ -290,22 +400,31 @@ class QQP(HFTask):
    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        pred = ll_yes > ll_no
        return {
            "acc": pred == gold,
            "f1": (gold, pred),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    def aggregation(self):
        return {
            "acc": mean,
            "f1": f1_score
        }
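# MRPC and QQP report both accuracy and F1: process_results() returns a boolean for
# "acc" (averaged with mean) and the raw (gold, pred) pair for "f1", which the
# corpus-level f1_score aggregation consumes.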
class STSB(HFTask):
...@@ -368,93 +487,3 @@ class STSB(HFTask):
            "minor": minor,
            "higher_is_better": True,
        }
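For a concrete feel of the converted tasks, here is a self-contained walk-through of the CoLA-style hook flow with a fake document set and fake log-likelihood scores. Nothing here loads a model or dataset; FakeCoLA only mirrors the methods shown in the diff above, and the scores are invented:

# Standalone illustration of the new hook flow; everything below is mocked,
# so it does not import the real task or call rf.loglikelihood.
import sklearn.metrics

class FakeCoLA:
    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        return {"mcc": (doc["label"], pred)}

    def aggregation(self):
        def matthews_corrcoef(items):
            golds, preds = zip(*items)
            return sklearn.metrics.matthews_corrcoef(golds, preds)
        return {"mcc": matthews_corrcoef}

task = FakeCoLA()
docs = [{"label": 1}, {"label": 0}, {"label": 1}]
fake_loglikelihoods = [(-1.2, -3.4), (-2.0, -0.5), (-0.9, -1.1)]  # (ll_true, ll_false) per doc

items = [task.process_results(d, r)["mcc"] for d, r in zip(docs, fake_loglikelihoods)]
print(task.aggregation()["mcc"](items))  # corpus-level Matthews correlation (1.0 here)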