Commit cf80f340 authored by Jason Phang

glue tasks

parent c2aaa501
......@@ -103,14 +103,16 @@ class Dataset(abc.ABC):
"""
pass
def fewshot_prefix(self):
def fewshot_description(self):
return ""
def fewshot_context(self, doc, k):
prefix = self.fewshot_prefix()
labeled_examples = "\n\n".join([self.doc_to_text(doc) for doc in self.fewshot_examples(k)])
example = self.doc_to_text(doc, include_target=False)
return prefix + labeled_examples + example
def fewshot_context(self, doc, num_fewshot, provide_description):
description = (self.fewshot_description() + "\n\n") if provide_description else ""
labeled_examples = "\n\n".join(
map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
) + "\n\n"
example = self.doc_to_text(doc, include_target=False).strip()
return description + labeled_examples + example
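For reference, a minimal sketch of the prompt layout the new fewshot_context produces: the optional description, then the labeled few-shot examples joined by blank lines, then the stripped, unlabeled query. The sentences below are invented for illustration and are not part of this commit.
# Illustrative only: simulating fewshot_context for num_fewshot=2 with a description.
description = "Does this sentence make sense?\n\n"
labeled_examples = "\n\n".join([
    "Sentence: The cat sat on the mat.\nAnswer: True",
    "Sentence: Mat the on sat the cat.\nAnswer: False",
]) + "\n\n"
example = "Sentence: Dogs bark loudly.\nAnswer:"
prompt = description + labeled_examples + example
print(prompt)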
class Registry:
......
......@@ -23,7 +23,7 @@ class BoolQ(base.Dataset):
def test_docs(self):
return []
def fewshot_description(self):
def fewshot_prefix(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
......
from models.gpt2 import GPT2LM
import argparse
import json
lm = GPT2LM()
import models
import tasks
print(lm.generate('1 + 1 = 2.\n3 + 5 = 8.\n4 + 9 = 13.\n4 + 3 = 7.\n2 + 3 =', '.'))
\ No newline at end of file
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
parser.add_argument('--model_args', default="")
parser.add_argument('--tasks', default="all_tasks")
parser.add_argument('--provide_description', action="store_true")
parser.add_argument('--num_fewshot', type=int, default=0)
return parser.parse_args()
def main():
args = parse_args()
model = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
else:
task_names = args.tasks.split(",")
task_list = {
task_name: tasks.get_task(task_name)()
for task_name in task_names
}
results = {}
for task_name, task in task_list.items():
if not task.has_validation_docs():
continue
result = task.evaluate(
docs=task.validation_docs(),
lm=model,
provide_description=args.provide_description,
num_fewshot=args.num_fewshot,
)
results[task_name] = result
print(json.dumps(results, indent=2))
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -16,5 +16,8 @@ for file in os.listdir(tasks_dir):
module = importlib.import_module('lm_evaluation_harness.tasks.' + module_name)
ALL_TASKS = sorted(list(TASK_REGISTRY.registry))
def get_task(task_name):
return TASK_REGISTRY.registry[task_name]
......@@ -31,3 +31,10 @@ def simple_accuracy_metric(preds, golds):
"minor": {"acc": acc},
"higher_is_better": True,
}
def yesno(x):
if x:
return 'yes'
else:
return 'no'
import nlp
import numpy as np
import random
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from . common import NLP_TASK, simple_accuracy_metric
from tqdm import auto as tqdm_lib
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
def get_accuracy_and_f1(preds, golds):
golds = np.array(golds)
preds = np.array(preds)
acc = float((preds == golds).mean())
f1 = float(f1_score(y_true=golds, y_pred=preds))
minor = {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
return {
"major": minor["acc_and_f1"],
"minor": minor,
"higher_is_better": True,
}
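As a quick sanity check of get_accuracy_and_f1 (toy values, assuming the function and imports above are in scope), three correct predictions out of four give an accuracy of 0.75 and an F1 of 0.8:
# Toy example only; the labels below are invented for illustration.
toy_golds = [1, 0, 1, 1]
toy_preds = [1, 0, 0, 1]
toy_result = get_accuracy_and_f1(preds=toy_preds, golds=toy_golds)
# toy_result["minor"] -> {"acc": 0.75, "f1": 0.8, "acc_and_f1": 0.775}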
@TASK_REGISTRY.register("cola")
class CoLA(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "cola"
def has_training_docs(self):
return True
......@@ -17,27 +37,25 @@ class CoLA(NLP_TASK):
def has_test_docs(self):
return True
def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?"
def doc_to_text(self, doc, include_target=True):
text = "Does this sentence make sense?:\tTrue or False?" \
"\nsentence:{}\nAnswer: ".format(doc["sentence"])
text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
if include_target:
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, k=0):
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
word = lm.generate(
context=self.fewshot_context(doc=doc, k=k),
max_gen_length=1,
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
if word.strip() == "True":
preds.append(1)
elif word.strip() == "False":
preds.append(0)
else:
preds.append(-1)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
golds = np.array(golds)
preds = np.array(preds)
mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
......@@ -50,6 +68,9 @@ class CoLA(NLP_TASK):
@TASK_REGISTRY.register("mnli")
class MNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mnli"
def has_training_docs(self):
return True
......@@ -69,8 +90,8 @@ class MNLI(NLP_TASK):
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
......@@ -79,26 +100,65 @@ class MNLI(NLP_TASK):
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, k=0):
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
word = lm.generate(
context=self.fewshot_context(doc=doc, k=k),
max_gen_length=1,
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
if word.strip() == "True":
preds.append(1)
elif word.strip() == "False":
preds.append(0)
else:
preds.append(-1)
probs = np.array([
lm.loglikelihood(ctx, ' True'),
lm.loglikelihood(ctx, ' Neither'),
lm.loglikelihood(ctx, ' False'),
])
preds.append(np.argmax(probs))
return simple_accuracy_metric(preds=preds, golds=golds)
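The loop above follows the rank-classification pattern used throughout this file: score each candidate answer string with lm.loglikelihood and keep the highest-scoring one. A hedged, self-contained sketch with a stub LM is shown below; the real loglikelihood implementation lives in the models package and is assumed here, not defined in this diff.
import numpy as np

class StubLM:
    """Illustrative stand-in for a real LM; returns fixed, made-up scores."""
    def loglikelihood(self, context, continuation):
        return {" True": -1.0, " Neither": -2.0, " False": -3.0}.get(continuation, -10.0)

stub = StubLM()
stub_ctx = "Some premise.\nquestion:\tSome hypothesis.\tTrue, False or Neither?\nanswer:"
choices = [" True", " Neither", " False"]
stub_pred = int(np.argmax([stub.loglikelihood(stub_ctx, c) for c in choices]))
# stub_pred == 0, i.e. the verbalized label the (fake) model scores highest.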
@TASK_REGISTRY.register("mrpc")
class MRPC(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mrpc"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("rte")
class RTE(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "rte"
......@@ -120,18 +180,230 @@ class RTE(NLP_TASK):
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, k=0):
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qnli")
class QNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["question"],
doc["sentence"],
)
if include_target:
# True = entailment
# False = not entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qqp")
class QQP(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qqp"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
doc["question1"],
doc["question2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("stsb")
class STSB(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "stsb"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
word = lm.generate(
context=self.fewshot_context(doc=doc, k=k),
max_gen_length=1,
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
if word.strip() == "True":
preds.append(1)
elif word.strip() == "False":
preds.append(0)
output = lm.generate(context=ctx, max_gen_length=5).strip()
first_element = output.split()[0] if output.split() else ""
# accept decimal outputs such as "2.5" as well as integers
if first_element.replace('.', '', 1).isnumeric():
pred = max(min(float(first_element), 5.0), 0.0)
else:
preds.append(-1)
pred = 2.5
preds.append(pred)
pearson_corr = float(pearsonr(preds, golds)[0])
spearman_corr = float(spearmanr(preds, golds)[0])
minor = {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
return {
"major": minor["corr"],
"minor": minor,
"higher_is_better": True,
}
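As a small illustration of the combined correlation metric (invented scores, assuming the scipy imports above are in scope): predictions that preserve the gold ranking but not the exact values give a Spearman correlation of 1.0 and a Pearson correlation just below it.
# Toy example only; values are made up.
toy_golds = [0.0, 1.0, 2.5, 5.0]
toy_preds = [0.5, 1.0, 3.0, 4.5]
toy_pearson = float(pearsonr(toy_preds, toy_golds)[0])    # ~0.98
toy_spearman = float(spearmanr(toy_preds, toy_golds)[0])  # 1.0
toy_corr = (toy_pearson + toy_spearman) / 2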
@TASK_REGISTRY.register("sst")
class SST(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "sst2"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative."
def doc_to_text(self, doc, include_target=True):
text = "sentence:\t{}\t\nanswer:".format(
doc["sentence"],
)
if include_target:
text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("wnli")
class WNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "wnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
return simple_accuracy_metric(preds=preds, golds=golds)
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
@TASK_REGISTRY.register("boolq")
class BoolQ(NLP_TASK):
NLP_PATH = "superglue"
NLP_NAME = "boolq"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+ (yesno(doc['answer']) if include_target else "")
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["answer"] for doc in docs]
preds = []
for doc in docs:
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return simple_accuracy_metric(preds=preds, golds=golds)