The goal of this project is to build a set of tools for evaluating LMs on typical NLU tasks.
The raw Google doc can be found here: https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
## Usage
### Evaluate a task
To evaluate a model (e.g. GPT-2) on NLU tasks (e.g. RTE, the Winograd Schema Challenge), run the following command:
```bash
python main.py \
--model gpt2 \
--model_args device=cuda:0 \
--tasks rte,wsc \
--provide_description \
--num_fewshot 2
```
If you have access to an OpenAI API key, you can also evaluate GPT-3 on various tasks with the following command:
```bash
export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE
python main.py \
--model gpt3 \
--tasks rte,wsc \
--provide_description \
--num_fewshot 2
```
To inspect what the LM inputs look like, you can run the following command:
```bash
python write_out.py \
--tasks all_tasks \
--provide_description \
--num_fewshot 5 \
--num_examples 10 \
--output_base_path /path/to/output/folder
```
This will write out one text file for each task.
### Code Structure
There are two major components of the library:
1. LMs (language models), e.g. GPT-2, GPT-3
2. Tasks, e.g. MNLI, RTE, SQuAD (coming soon)
Both LMs (`lm_eval.models`) and Tasks (`lm_eval.tasks`) are kept in a registry data structure, for easy CLI instantiation.
**If you want to extend either models or tasks, simply add a new LM or Task subclass, and decorate with the registry decorator**.
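As an illustrative sketch of this registry pattern (the names `TASK_REGISTRY`, `register_task`, and `Task` below are assumptions for illustration, not the library's actual API), registering a new task might look like:

```python
# Hypothetical sketch of a name -> class registry, mirroring how
# lm_eval.tasks could map CLI task names to Task subclasses.
TASK_REGISTRY = {}

def register_task(name):
    """Decorator that records a Task subclass under a CLI-visible name."""
    def decorator(cls):
        TASK_REGISTRY[name] = cls
        return cls
    return decorator

class Task:
    """Minimal stand-in base class for a task definition."""
    def fewshot_examples(self, k):
        raise NotImplementedError

@register_task("rte")
class RTETask(Task):
    def fewshot_examples(self, k):
        # A real implementation would sample k labeled examples here.
        return []

# CLI code can then instantiate any registered task by name:
task = TASK_REGISTRY["rte"]()
```

With this shape, `--tasks rte,wsc` on the command line reduces to dictionary lookups in the registry, which is what makes the "subclass plus decorator" extension path work.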
**GLUE**
- [X] CoLA
- [X] MNLI
- [X] MRPC
- [X] RTE
- [X] QNLI
- [X] QQP
- [X] STS-B
- [X] SST-2
- [X] WNLI
**SuperGLUE**
- [X] BoolQ
- [X] CommitmentBank
- [X] COPA
- [ ] MultiRC
- [ ] ReCoRD
- [X] RTE (See: GLUE)
- [X] WiC
- [X] WSC
**QA Tasks**
- [ ] CoQA
- [ ] DROP
## Description
### 1. LM Evaluation
Given an LM, we want to evaluate it on a wide range of NLU tasks. We should at least cover the set of tasks in the GPT-3 paper, and any other tasks/benchmarks that are relevant. We will follow the GPT-3 format of a) zero-shot, b) one-shot, c) few-shot evaluation.
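The few-shot format amounts to prompt concatenation: an optional task description, then k solved examples, then the unanswered query. The helper below is a minimal sketch of that idea under assumed names, not the project's actual code:

```python
def make_fewshot_prompt(description, examples, query, k):
    """Build a k-shot prompt (hypothetical helper for illustration):
    optional task description, then k solved (text, answer) examples,
    then the query left unanswered for the LM to complete."""
    parts = []
    if description:
        parts.append(description)
    for text, answer in examples[:k]:
        parts.append(f"{text}\nAnswer: {answer}")
    parts.append(f"{query}\nAnswer:")
    return "\n\n".join(parts)

prompt = make_fewshot_prompt(
    "Does the premise entail the hypothesis?",
    [("Premise: A dog runs. Hypothesis: An animal moves.", "yes")],
    "Premise: It rains. Hypothesis: The ground is dry.",
    k=1,
)
```

Setting k=0 with no examples recovers the zero-shot case, and k=1 the one-shot case, so all three GPT-3 settings fall out of one prompt builder.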
...
...
This part is the easiest. I guess we just write out some text files containing the LM inputs.
## Summary (need to convert from google docs at some point):