Unverified Commit ffbaef21 authored by Stella Biderman's avatar Stella Biderman Committed by GitHub

Merge pull request #3 from zphang/refactor

LM Eval Refactor; GPT-3; GLUE tasks
parents 7a32afeb e7a87e71
import abc
import random
class LM(abc.ABC):
@abc.abstractmethod
def generate(self, context, until):
pass
@abc.abstractmethod
def loglikelihood(self, context, continuation):
pass
class Dataset(abc.ABC):
@abc.abstractmethod
def has_training_docs(self):
pass
@abc.abstractmethod
def has_validation_docs(self):
pass
@abc.abstractmethod
def training_docs(self):
pass
@abc.abstractmethod
def validation_docs(self):
pass
@abc.abstractmethod
def test_docs(self):
pass
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
random.seed(123)
random.shuffle(traindocs)
return traindocs[:k]
@abc.abstractmethod
def fewshot_description(self):
pass
@abc.abstractmethod
def doc_to_text(self, doc, include_target=True):
pass
@abc.abstractmethod
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
# Datasets loaded via the `nlp` library generally do not require separately downloading data
#coqa
mkdir -p data/coqa
......
import transformers
from base import LM
import torch
import torch.nn.functional as F
class GPT2LM(LM):
def __init__(self, dev='cpu'):
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(dev)
self.tok = transformers.GPT2Tokenizer.from_pretrained('gpt2')
self.dev = dev
def generate(self, context, until):
context = torch.tensor([self.tok.encode(context.strip())], dtype=torch.long).to(self.dev)
res = self.gpt2.generate(context, eos_token_id=self.tok.encoder[until], do_sample=False, max_length=1024)
# chop off the prompt and the final eos token
return self.tok.decode(res[0][len(context[0]):-1]).strip()
def loglikelihood(self, context, continuation):
print('likelihood:', context, continuation)
inp = torch.tensor([self.tok.encode(context + continuation)], dtype=torch.long).to(self.dev)
ctxlen = len(self.tok.encode(context.strip()))
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
return torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
import base
import nlp
def yesno(x):
if x: return 'yes'
else: return 'no'
def mean(x):
return sum(x) / len(x)
class BoolQ(base.Dataset):
def __init__(self):
self.dataset = nlp.load_dataset('boolq')
def training_docs(self):
yield from self.dataset['train']
def validation_docs(self):
yield from self.dataset['validation']
def test_docs(self):
return []
def fewshot_description(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " + (yesno(doc['answer']) if include_target else "")
def evaluate(self, docs, lm, provide_description, num_fewshot):
acc = []
for doc in docs:
ctx = '\n\n'.join(map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))) + '\n\n'
ctx += self.doc_to_text(doc, include_target=False).strip()
ctx = ((self.fewshot_description() + "\n\n") if provide_description else "") + ctx
ans = lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no')
acc.append(int(ans == doc['answer']))
return mean(acc)
\ No newline at end of file
import abc
import random
class LM(abc.ABC):
@abc.abstractmethod
def generate(self, context, max_gen_length):
"""Conditional text generation with an LM
:param context: str
Context string for conditional generation
:param max_gen_length: int
Maximum number of tokens to generate
:return: str
"""
pass
@abc.abstractmethod
def loglikelihood(self, context, continuation):
"""Compute log-likelihood of a generation a continuation from a context
Assume that the final text will simple be
context + continuation
:param context: str
Context string for conditional generation
        :param continuation: str
            The continuation whose log-likelihood is computed, given the context
:return: float
"""
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
:param arg_string: str
Left up to individual model class to handle
"""
return cls()
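# Illustrative use of LM.create_from_arg_string (model names come from later in this
# PR): get_model("gpt2").create_from_arg_string("device=cuda") builds a GPT2LM on
# CUDA, while this default implementation ignores the string and just calls cls().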
class Dataset(abc.ABC):
@abc.abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
@abc.abstractmethod
def training_docs(self):
"""
:return: Iterable[obj]
            An iterable of any object that doc_to_text can handle
"""
pass
@abc.abstractmethod
def validation_docs(self):
pass
@abc.abstractmethod
def test_docs(self):
pass
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
random.shuffle(traindocs)
return traindocs[:k]
@abc.abstractmethod
def doc_to_text(self, doc, include_target=True):
pass
@abc.abstractmethod
def evaluate(self, docs, lm, provide_description, num_fewshot):
"""Take iterable of docs and evaluates, returning a dict with the following format:
{
"major": float,
"minor": dict,
"higher_is_better": bool,
}
* `major` should be a single, representative number, for programmatic comparison
* `minor` should be a dictionary containing all relevant sub-metrics
* `higher_is_better` determines whether a higher metric is better
"""
pass
def fewshot_description(self):
return ""
def fewshot_context(self, doc, num_fewshot, provide_description):
raw_description = self.fewshot_description()
description = (raw_description + "\n\n") if provide_description and raw_description else ""
labeled_examples = "\n\n".join(
map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
) + "\n\n"
example = self.doc_to_text(doc, include_target=False).strip()
return description + labeled_examples + example
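    # Illustrative prompt layout produced above (provide_description=True, num_fewshot=1):
    #
    #   <fewshot_description()>
    #
    #   <doc_to_text(fewshot example, include_target=True)>
    #
    #   <doc_to_text(doc, include_target=False)>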
class Registry:
def __init__(self, registry_name):
self.registry_name = registry_name
self.registry = {}
def register(self, name):
def register_cls(new_cls):
if name in self.registry:
                raise ValueError('Cannot register duplicate {} ({})'.format(self.registry_name, name))
self.registry[name] = new_cls
return new_cls
return register_cls
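# Usage sketch (illustrative names): a Registry instance acts as a class decorator.
#
#   EXAMPLE_REGISTRY = Registry(registry_name="examples")
#
#   @EXAMPLE_REGISTRY.register("foo")
#   class Foo:
#       pass
#
# afterwards EXAMPLE_REGISTRY.registry["foo"] is Foo, and registering the name
# "foo" a second time raises ValueError.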
import importlib
import os
from lm_eval.base import Registry
MODEL_REGISTRY = Registry(registry_name="models")
# Load all modules in models directory to populate registry
models_dir = os.path.dirname(__file__)
for file in os.listdir(models_dir):
path = os.path.join(models_dir, file)
if (
not file.startswith('_')
and not file.startswith('.')
and (file.endswith('.py') or os.path.isdir(path))
):
module_name = file[:file.find('.py')] if file.endswith('.py') else file
module = importlib.import_module('lm_eval.models.' + module_name)
def get_model(model_name):
return MODEL_REGISTRY.registry[model_name]
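# e.g. get_model("gpt2") returns the GPT2LM class registered below; a configured
# instance is then built with get_model("gpt2").create_from_arg_string("device=cpu").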
from lm_eval.base import LM
from . import MODEL_REGISTRY
@MODEL_REGISTRY.register("dummy")
class DummyLM(LM):
def generate(self, context, max_gen_length):
return "lol"
def loglikelihood(self, context, continuation):
return 0.0
import transformers
import torch
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from . import MODEL_REGISTRY
@MODEL_REGISTRY.register("gpt2")
class GPT2LM(LM):
def __init__(self, device="cpu"):
self.device = torch.device(device)
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
self.tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
@classmethod
def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string)
return cls(device=args.get("device", "cpu"))
def generate(self, context, max_gen_length):
context = torch.tensor([self.tokenizer.encode(context.strip())], dtype=torch.long).to(self.device)
res = self.gpt2.generate(
context,
eos_token_id=self.tokenizer.eos_token_id,
do_sample=False,
max_length=max_gen_length,
)
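        # note: `max_length` in transformers' generate() counts prompt tokens plus
        # generated tokens, so this bounds the total sequence length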
# chop off the prompt and the final eos token
return self.tokenizer.decode(res[0][len(context[0]):-1]).strip()
def loglikelihood(self, context, continuation):
inp = torch.tensor([self.tokenizer.encode(context + continuation)], dtype=torch.long).to(self.device)
ctxlen = len(self.tokenizer.encode(context.strip()))
cont_toks = inp[:, ctxlen:] # [batch, seq]
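        # positions ctxlen - 1 through -2 of the logits predict tokens ctxlen through -1,
        # i.e. exactly the continuation tokens scored below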
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
        # sum the per-token log-probs into a single float, as the LM interface documents
        return torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum().item()
import os
import openai
import transformers
from lm_eval.base import LM
from lm_eval import utils
from . import MODEL_REGISTRY
@MODEL_REGISTRY.register("gpt3")
class GPT3LM(LM):
def __init__(self, engine):
self.engine = engine
self.tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string)
return cls(engine=args.get("engine", "davinci"))
def generate(self, context, max_gen_length):
response = openai.Completion.create(
engine=self.engine,
prompt=context,
max_tokens=max_gen_length,
temperature=0.0,
)
return response.choices[0]["text"]
def loglikelihood(self, context, continuation):
full_text = context + continuation
full_text_length = len(self.tokenizer.tokenize(full_text))
context_length = len(self.tokenizer.tokenize(context))
continuation_length = len(self.tokenizer.tokenize(continuation))
assert full_text_length == context_length + continuation_length
response = openai.Completion.create(
engine=self.engine,
prompt=full_text,
echo=True,
max_tokens=0, temperature=0.0,
logprobs=0,
)
logprobs = response.choices[0]["logprobs"]["token_logprobs"]
continuation_logprobs = logprobs[-continuation_length:]
return sum(continuation_logprobs)
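    # Scoring note: with echo=True, max_tokens=0 and logprobs set, the Completions API
    # returns per-token logprobs for the prompt itself, so the last `continuation_length`
    # entries correspond to the continuation tokens summed above.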
import importlib
import os
from lm_eval.base import Registry
TASK_REGISTRY = Registry(registry_name="tasks")
# Load all modules in tasks directory to populate registry
tasks_dir = os.path.dirname(__file__)
for file in os.listdir(tasks_dir):
path = os.path.join(tasks_dir, file)
if (
not file.startswith('_')
and not file.startswith('.')
and (file.endswith('.py') or os.path.isdir(path))
):
module_name = file[:file.find('.py')] if file.endswith('.py') else file
module = importlib.import_module('lm_eval.tasks.' + module_name)
ALL_TASKS = sorted(list(TASK_REGISTRY.registry))
def get_task(task_name):
    return TASK_REGISTRY.registry[task_name]
import nlp
import numpy as np
import random
from ..base import Dataset
class NLP_TASK(Dataset):
NLP_PATH = None
NLP_NAME = None
def _load_nlp_dataset(self):
return nlp.load_dataset(path=self.NLP_PATH, name=self.NLP_NAME)
def training_docs(self):
if self.has_training_docs():
return self._load_nlp_dataset()["train"]
def validation_docs(self):
if self.has_validation_docs():
return self._load_nlp_dataset()["validation"]
def test_docs(self):
if self.has_test_docs():
return self._load_nlp_dataset()["test"]
def fewshot_examples(self, k):
training_docs = self.training_docs()
n = len(training_docs)
indices = random.sample(range(n), k)
return [training_docs[i] for i in indices]
def simple_accuracy_metric(preds, golds):
acc = float((np.array(preds) == np.array(golds)).mean())
return {
"major": acc,
"minor": {"acc": acc},
"higher_is_better": True,
}
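# e.g. simple_accuracy_metric(preds=[1, 0, 1], golds=[1, 1, 1]) returns
# {"major": 0.666..., "minor": {"acc": 0.666...}, "higher_is_better": True}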
def yesno(x):
if x:
return 'yes'
else:
return 'no'
import os
import json
import random
from lm_eval.base import Dataset
from . import TASK_REGISTRY
@TASK_REGISTRY.register("coqa")
class CoQA(Dataset):
    def has_training_docs(self):
        return True
......
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
def get_accuracy_and_f1(preds, golds):
golds = np.array(golds)
preds = np.array(preds)
acc = float((preds == golds).mean())
f1 = float(f1_score(y_true=golds, y_pred=preds))
minor = {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
return {
"major": minor["acc_and_f1"],
"minor": minor,
"higher_is_better": True,
}
@TASK_REGISTRY.register("cola")
class CoLA(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "cola"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?"
def doc_to_text(self, doc, include_target=True):
text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
if include_target:
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
golds = np.array(golds)
preds = np.array(preds)
mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
return {
"major": mcc,
"minor": {"mcc": mcc},
"higher_is_better": True,
}
@TASK_REGISTRY.register("mnli")
class MNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def validation_docs(self):
if self.has_validation_docs():
return self._load_nlp_dataset()["validation_matched"]
def test_docs(self):
if self.has_test_docs():
return self._load_nlp_dataset()["test_matched"]
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
probs = np.array([
                lm.loglikelihood(ctx, ' True'),
                lm.loglikelihood(ctx, ' Neither'),
                lm.loglikelihood(ctx, ' False'),
])
preds.append(np.argmax(probs))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("mrpc")
class MRPC(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mrpc"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("rte")
class RTE(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "rte"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
# 0 = entailment
# 1 = not_entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qnli")
class QNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["question"],
doc["sentence"],
)
if include_target:
# True = entailment
# False = not entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qqp")
class QQP(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qqp"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
doc["question1"],
doc["question2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("stsb")
class STSB(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "stsb"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
        if include_target:
            # the STS-B target is a similarity score in [0, 5], not a yes/no label
            text += " {}".format(doc["label"])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
            output = lm.generate(context=ctx, max_gen_length=5).strip()
            try:
                # accept decimal outputs such as "3.5" and clamp to the valid [0, 5] range
                pred = max(min(float(output.split()[0]), 5.0), 0.0)
            except (IndexError, ValueError):
                # fall back to the midpoint when the output is empty or non-numeric
                pred = 2.5
preds.append(pred)
pearson_corr = float(pearsonr(preds, golds)[0])
spearman_corr = float(spearmanr(preds, golds)[0])
minor = {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
return {
"major": minor["corr"],
"minor": minor,
"higher_is_better": True,
}
@TASK_REGISTRY.register("sst")
class SST(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "sst2"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative."
def doc_to_text(self, doc, include_target=True):
text = "sentence:\t{}\t\nanswer:".format(
doc["sentence"],
)
if include_target:
text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("wnli")
class WNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "wnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
    def doc_to_text(self, doc, include_target=True):
        text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
        if include_target:
            # WNLI is two-class:
            # 1 = entailment ("True")
            # 0 = not_entailment ("False")
            text += " {}".format({1: "True", 0: "False"}[doc["label"]])
        return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
return simple_accuracy_metric(preds=preds, golds=golds)
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
@TASK_REGISTRY.register("boolq")
class BoolQ(NLP_TASK):
NLP_PATH = "super_glue"
NLP_NAME = "boolq"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+ (yesno(doc['label']) if include_target else "")
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return simple_accuracy_metric(preds=preds, golds=golds)
def simple_parse_args_string(args_string):
"""
Parses something like
        arg1=val1,arg2=val2
    into a dictionary
"""
args_string = args_string.strip()
if not args_string:
return {}
arg_list = args_string.split(",")
args_dict = {}
for arg in arg_list:
k, v = arg.split("=")
args_dict[k] = v
return args_dict
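# e.g. simple_parse_args_string("device=cuda,engine=davinci") returns
# {"device": "cuda", "engine": "davinci"}; values are left as strings.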
from gpt2 import GPT2LM
lm = GPT2LM()
print(lm.generate('1 + 1 = 2.\n3 + 5 = 8.\n4 + 9 = 13.\n4 + 3 = 7.\n2 + 3 =', '.'))
\ No newline at end of file
import argparse
import json
import numpy as np
import random
from lm_eval import models, tasks
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
parser.add_argument('--model_args', default="")
parser.add_argument('--tasks', default="all_tasks")
parser.add_argument('--provide_description', action="store_true")
parser.add_argument('--num_fewshot', type=int, default=1)
parser.add_argument('--seed', type=int, default=1234)
return parser.parse_args()
def main():
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
lm = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
else:
task_names = args.tasks.split(",")
task_dict = {
task_name: tasks.get_task(task_name)()
for task_name in task_names
}
results = {}
for task_name, task in task_dict.items():
if not task.has_validation_docs():
continue
result = task.evaluate(
docs=task.validation_docs(),
lm=lm,
provide_description=args.provide_description,
num_fewshot=args.num_fewshot,
)
results[task_name] = result
print(json.dumps(results, indent=2))
if __name__ == "__main__":
main()
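# Example invocation (assuming this entry point is saved as main.py):
#   python main.py --model gpt2 --model_args device=cpu --tasks cola,sst --num_fewshot 1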