Commit 9b933d96 authored by jeffhsu3

merged changes

parents c71dcb91 c0fbf9e8
# Evaluation Harness for Large Language Models
![](https://github.com/EleutherAI/lm-evaluation-harness/workflows/Python%20application/badge.svg)
[![codecov](https://codecov.io/gh/EleutherAI/lm-evaluation-harness/branch/master/graph/badge.svg?token=JSG3O2427J)](https://codecov.io/gh/EleutherAI/lm-evaluation-harness)
## Overview
The goal of this project is to build a set of tools for evaluating LMs on typical NLU tasks, based on the evaluation of GPT-3 described in https://arxiv.org/pdf/2005.14165.pdf. Following that description, this repo should support 3 functions:
......@@ -7,6 +10,49 @@ The goal of this project is to build a set of tools for evaluating LMs on typica
2. Removing task val/test data from LM training set
3. Adding task training data to LM training set
### Overview of Tasks
| Task Name |Train|Val|Test| Metrics |
|---------------|-----|---|----|--------------------|
|cola |✓ |✓ |✓ |mcc |
|mnli |✓ |✓ |✓ |acc |
|mnli_mismatched|✓ |✓ |✓ |acc |
|mrpc |✓ |✓ |✓ |acc, f1 |
|rte |✓ |✓ |✓ |acc |
|qnli |✓ |✓ |✓ |acc |
|qqp |✓ |✓ |✓ |acc, f1 |
|sst |✓ |✓ |✓ |acc |
|wnli |✓ |✓ |✓ |acc |
|boolq |✓ |✓ |✓ |acc |
|cb |✓ |✓ |✓ |acc, f1 |
|copa |✓ |✓ |✓ |acc |
|multirc |✓ |✓ |✓ |acc |
|wic |✓ |✓ |✓ |acc |
|wsc |✓ |✓ |✓ |acc |
|lambada | |✓ | |perplexity, accuracy|
|piqa |✓ |✓ | |acc |
|arc_easy |✓ |✓ |✓ |acc |
|arc_challenge |✓ |✓ |✓ |acc |
|hellaswag |✓ |✓ |✓ |acc |
|race |✓ |✓ |✓ |acc |
|webqs |✓ | |✓ |acc |
|wsc273 | | |✓ |acc |
|winogrande |✓ |✓ |✓ |acc |
|anli_r1 |✓ |✓ |✓ |acc |
|anli_r2 |✓ |✓ |✓ |acc |
|anli_r3 |✓ |✓ |✓ |acc |
|arithmetic_2da | |✓ | |acc |
|arithmetic_2ds | |✓ | |acc |
|arithmetic_3da | |✓ | |acc |
|arithmetic_3ds | |✓ | |acc |
|arithmetic_4da | |✓ | |acc |
|arithmetic_4ds | |✓ | |acc |
|arithmetic_5da | |✓ | |acc |
|arithmetic_5ds | |✓ | |acc |
|arithmetic_2dm | |✓ | |acc |
|arithmetic_1dc | |✓ | |acc |
## Usage
### Evaluate a task
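The usage details are collapsed in this diff. As a minimal sketch, assuming module paths such as `lm_eval.models`, `lm_eval.tasks`, and `lm_eval.evaluator` (the registries and the `evaluate` signature themselves appear in the code further below), an evaluation could be driven from Python like this:

```python
# Hypothetical driver script: the module paths are assumptions, but MODEL_REGISTRY,
# TASK_REGISTRY, and evaluate(lm, task_dict, provide_description, num_fewshot, limit)
# match the code in this commit.
from lm_eval import models, tasks, evaluator

lm = models.MODEL_REGISTRY["gpt2"].create_from_arg_string("")
task_dict = {"lambada": tasks.TASK_REGISTRY["lambada"]()}

results = evaluator.evaluate(
    lm=lm,
    task_dict=task_dict,
    provide_description=True,
    num_fewshot=2,
    limit=10,  # only evaluate the first 10 documents per task
)
print(results)  # e.g. {"lambada": {"perplexity": ..., "accuracy": ...}}
```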
......
......@@ -2,6 +2,7 @@ import abc
import random
import numpy as np
import sklearn
import math
class LM(abc.ABC):
......@@ -58,10 +59,10 @@ class LM(abc.ABC):
return cls()
class Dataset(abc.ABC):
class Task(abc.ABC):
def __init__(self):
self.download()
self._traindocs = None
self._training_docs = None
def download(self):
"""Downloads the task dataset if necessary"""
......@@ -71,7 +72,7 @@ class Dataset(abc.ABC):
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
......@@ -84,23 +85,29 @@ class Dataset(abc.ABC):
def training_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return []
def validation_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return []
def test_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return []
def fewshot_examples(self, k):
if self._traindocs is None:
self._traindocs = list(self.training_docs())
return random.sample(self._traindocs, k)
def fewshot_examples(self, k):
if self._training_docs is None:
self._training_docs = list(self.training_docs())
return random.sample(self._training_docs, k)
@abc.abstractmethod
def doc_to_text(self, doc):
......@@ -123,7 +130,7 @@ class Dataset(abc.ABC):
part of the document for `doc`.
"""
pass
@abc.abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -161,7 +168,7 @@ class Dataset(abc.ABC):
def fewshot_context(self, doc, num_fewshot, provide_description):
raw_description = self.fewshot_description()
description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
if num_fewshot == 0:
labeled_examples = ""
else:
......@@ -169,10 +176,42 @@ class Dataset(abc.ABC):
[self.doc_to_text(doc) + self.doc_to_target(doc) for doc in self.fewshot_examples(k=num_fewshot)]
) + "\n\n"
example = self.doc_to_text(doc).strip()
example = self.doc_to_text(doc)
return description + labeled_examples + example
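# Illustrative only (not from the repo): for num_fewshot=1 with a description,
# the returned prompt looks roughly like
#
#   <description>
#   ===
#
#   <doc_to_text(example)><doc_to_target(example)>
#
#   <doc_to_text(doc)>
#
# i.e. the description block, the labeled few-shot examples, and finally the
# unlabeled query that the LM is asked to complete.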
class MultipleChoiceTask(Task):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
def construct_requests(self, doc, ctx):
lls = [
rf.loglikelihood(ctx, " {}".format(choice))[0]
for choice in doc['choices']
]
return lls
def process_results(self, doc, results):
gold = doc["gold"]
acc = 1. if np.argmax(results) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
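# MultipleChoiceTask expects each doc to be a dict with a "choices" list and a
# "gold" index into it, e.g. (illustrative only):
#   {"query": "2 + 2 =", "choices": ["3", "4", "5"], "gold": 1}
# Subclasses then only need to supply the docs and doc_to_text; target
# construction, per-choice loglikelihood requests, and accuracy scoring are
# handled by the methods above.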
def mean(arr):
return sum(arr) / len(arr)
......@@ -193,7 +232,8 @@ def f1_score(items):
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return max(fscore)
return np.max(fscore)
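# np.max also handles the scalar that sklearn returns for binary averaging,
# where the builtin max() would raise a TypeError.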
def acc_all(items):
......@@ -223,10 +263,70 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
return max(scores_for_ground_truths)
def perplexity(items):
return math.exp(-mean(items))
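# `items` are the per-document log-likelihoods collected by `evaluate`, so this
# aggregation returns exp(-mean(ll)): the geometric mean of per-document
# perplexities rather than an arithmetic mean of per-document exp(-ll) values.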
req_ret_lens = {
'loglikelihood': 2
'loglikelihood': 2,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(args):
dat = b""
for arg in args:
assert isinstance(arg, str) or isinstance(arg, int)
dat += str(arg).encode()
dat += b"\0"
return hashlib.sha256(dat).hexdigest()
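# e.g. hash_args(("Hello", " world")) yields a 64-character hex digest; prefixed
# with the method name below, it becomes the SQLite cache key.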
class CachingLM:
def __init__(self, lm, cache_db):
self.lm = lm
self.cache_db = cache_db
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
def __getattr__(self, attr):
def fn(requests):
res = []
remaining_reqs = []
# figure out which ones are cached and which ones are new
for req in requests:
hsh = attr + '_' + hash_args(req)
if hsh in self.dbdict:
ob = self.dbdict[hsh]
assert ob is not None
res.append(ob)
else:
res.append(None)
remaining_reqs.append(req)
# actually run the LM
rem_res = getattr(self.lm, attr)(remaining_reqs)
# insert the new results back into the list and cache them
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1
res[resptr] = r
# caching
hsh = attr + '_' + hash_args(req)
self.dbdict[hsh] = r
return res
return fn
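# Usage sketch (hypothetical):
#   lm = CachingLM(gpt2.GPT2LM(), "lm_cache/gpt2.db")
#   lm.loglikelihood([("The capital of France is", " Paris")])
# The first call runs the underlying model and stores each result keyed by
# sha256(args); an identical later call is served straight from SQLite.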
class Request:
def __init__(self, type, args, index=None):
......
import collections
import itertools
def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
results = collections.defaultdict(dict)
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
# we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
# (or we could make it write the requests to disk and then read them back out again, probably using an sqlite db, because of all the moving parts we have)
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of request
for task_name, task in task_dict_items:
# default to validation docs, falling back to test docs if validation is unavailable
# TODO: the val-fallback-to-test system isn't final, we should revisit it at some point
if task.has_validation_docs():
task_doc_func = task.validation_docs
elif task.has_test_docs():
task_doc_func = task.test_docs
for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, limit)):
docs[(task_name, doc_id)] = doc
ctx = task.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
reqs = task.construct_requests(doc, ctx)
for i, req in enumerate(reqs):
requests[req.type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple separate LM requests for Requests that differ
# only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. We could also implement some kind of autogrouping here; they should end up next to each other.
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
process_res_queue[(task_name, doc_id)].append((i, resp))
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for (task_name, doc_id), requests in process_res_queue.items():
requests.sort(key=lambda x: x[0])
requests = [x[1] for x in requests]
task = task_dict[task_name]
doc = docs[(task_name, doc_id)]
metrics = task.process_results(doc, requests)
for metric, value in metrics.items():
vals[(task_name, metric)].append(value)
# aggregate results
for (task_name, metric), items in vals.items():
task = task_dict[task_name]
results[task_name][metric] = task.aggregation()[metric](items)
return results
\ No newline at end of file
from . import gpt2
from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
}
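# Models are looked up by name and constructed from a CLI-style arg string,
# e.g. MODEL_REGISTRY["gpt3"].create_from_arg_string("engine=davinci")
# (illustrative; the accepted keys depend on each model class).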
......
......@@ -12,6 +12,7 @@ class GPT2LM(LM):
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
self.gpt2.eval()
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.tokenizer.pad_token = "<|endoftext|>"
@classmethod
def create_from_arg_string(cls, arg_string):
......
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import os
import transformers
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
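"""Processes one OpenAI Completion choice: returns the summed log-probability of
the continuation tokens (indices >= ctxlen) and whether every continuation token
was also the model's top-1 (greedy) choice."""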
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
for i in range(ctxlen, len(response["logprobs"]["tokens"])):
token = response["logprobs"]["tokens"][i]
top_tokens = response["logprobs"]["top_logprobs"][i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
def oa_completion(**kwargs):
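"""Queries the OpenAI Completion API, retrying on OpenAIError with exponential
backoff (3s, 4.5s, 6.75s, ...)."""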
import openai
backoff_time = 3
while True:
try:
return openai.Completion.create(**kwargs)
except openai.error.OpenAIError:
time.sleep(backoff_time)
backoff_time *= 1.5
class GPT3LM(LM):
MAX_LENGTH = 2048
REQ_CHUNK_SIZE = 64
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
......@@ -21,6 +51,9 @@ class GPT3LM(LM):
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
self.truncate = truncate
# Read from environment variable OPENAI_API_SECRET_KEY
......@@ -31,23 +64,53 @@ class GPT3LM(LM):
args = utils.simple_parse_args_string(arg_string)
return cls(engine=args.get("engine", "davinci"))
def loglikelihood(self, context, continuation):
# TODO: implement new framework
def loglikelihood(self, requests):
import openai
res = []
for chunk in tqdm(list(utils.chunks(requests, self.REQ_CHUNK_SIZE))):
inps = []
ctxlens = []
for context, continuation in chunk:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
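# If context + continuation exceeds MAX_LENGTH, the left end of the context is
# truncated; ctxlen shrinks by the same amount so that logprobs are summed over
# exactly the continuation tokens.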
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
inps.append(inp)
ctxlens.append(ctxlen)
response = oa_completion(
engine=self.engine,
prompt=inps,
echo=True,
max_tokens=0, temperature=0.,
logprobs=10,
)
for resp, ctxlen in zip(response.choices, ctxlens):
res.append(get_result(resp, ctxlen))
return res
def greedy_until(self, requests):
import openai
res = []
for context, until in tqdm(requests):
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
ctxlen = len(context_enc) - max(0, len(context_enc) - (self.MAX_LENGTH - self.MAX_GEN_TOKS))
response = oa_completion(
engine=self.engine,
prompt=[inp],
max_tokens=self.MAX_GEN_TOKS,
temperature=0.,
logprobs=10,
)
res.append(response.choices[0]['text'])
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
inp = (context_enc + continuation_enc)[-1024:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
response = openai.Completion.create(
engine=self.engine,
prompt=inp,
echo=True,
max_tokens=0, temperature=0.0,
logprobs=0,
)
logprobs = response.choices[0]["logprobs"]["token_logprobs"]
continuation_logprobs = logprobs[ctxlen:]
return sum(continuation_logprobs)
return res
......@@ -18,6 +18,7 @@ from . import race
from . import piqa
from . import triviaqa
from . import pubmedqa
from . import webqs
TASK_REGISTRY = {
......@@ -37,7 +38,7 @@ TASK_REGISTRY = {
"cb": superglue.CommitmentBank,
"copa": superglue.Copa,
"multirc": superglue.MultiRC,
"record": superglue.ReCoRD,
#"record": superglue.ReCoRD,
"wic": superglue.WordsInContext,
"wsc": superglue.SGWinogradSchemaChallenge,
......@@ -50,8 +51,8 @@ TASK_REGISTRY = {
"sciq" : pubmedqa.SciQ,
#"triviaqa": triviaqa.TriviaQA,
# "arc_easy": arc.ARCEasy, # not implemented yet
# "arc_challenge": arc.ARCChallenge, # not implemented yet
"arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet
"hellaswag": hellaswag.HellaSwag, # not implemented yet
# "openbookqa": openbookqa.OpenBookQA, # not implemented yet
......@@ -59,9 +60,9 @@ TASK_REGISTRY = {
# "squad": squad.SQuAD, # not implemented yet
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
# "webqs": webqs.WebQs, # not implemented yet
# "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet
# "winogrande": winogrande.Winogrande, # not implemented yet
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
"winogrande": winogrande.Winogrande,
"anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2,
"anli_r3": anli.ANLIRound3,
......
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
class ARCEasy(HFTask):
DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy"
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def __clean_data(self):
""" Resolves various edge cases in the unprocessed HF ARC dataset. """
# NOTE: Some `doc["answerKey"]`s are in numeric string format, one of
# {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
result = {}
for split, data in self.data.items():
result[split] = []
for doc in data:
# Ensure all `answerKey`s and `label`s are in letter format.
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [
num_to_letter.get(label, label) for label in doc["choices"]["label"]
]
result[split].append(doc)
return result
def has_training_docs(self):
return True
......@@ -21,7 +47,8 @@ class ARCEasy(HFTask):
return "Question: " + doc['question'] + '\nAnswer:'
def doc_to_target(self, doc):
return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
index = self.letter_to_num[doc["answerKey"]]
return " " + doc['choices']['text'][index]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
......@@ -34,9 +61,11 @@ class ARCEasy(HFTask):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
ll_choices = []
for choice in doc["choices"]["text"]:
ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
return ll_choices
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -47,8 +76,11 @@ class ARCEasy(HFTask):
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
gold = self.letter_to_num[doc["answerKey"]]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self):
"""
......@@ -56,8 +88,9 @@ class ARCEasy(HFTask):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return {
"acc": mean
}
def higher_is_better(self):
"""
......@@ -65,8 +98,10 @@ class ARCEasy(HFTask):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return {
"acc": True
}
class ARCChallenge(ARCEasy):
DATASET_PATH = "ai2_arc"
......
......@@ -2,12 +2,12 @@ import abc
import json
import os
from collections import namedtuple
from lm_eval.base import Dataset, mean, rf
from lm_eval.base import Task, mean, rf
from best_download import download_file
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
class Arithmetic(Dataset):
class Arithmetic(Task):
directory = 'data/arithmetic/'
def __init__(self):
......@@ -32,7 +32,7 @@ class Arithmetic(Dataset):
self._docs = [self.load_doc(json.loads(line)) for line in jsons]
def has_training_docs(self):
return True
return False
def has_validation_docs(self):
return True
......@@ -41,10 +41,10 @@ class Arithmetic(Dataset):
return False
def training_docs(self):
return self._docs
return NotImplemented
def validation_docs(self):
return self._docs[:100]
return self._docs
def test_docs(self):
return NotImplemented
......
import datasets
import numpy as np
import random
from ..base import Dataset
from ..base import Task
class HFTask(Dataset):
class HFTask(Task):
DATASET_PATH = None
DATASET_NAME = None
def __init__(self):
self.data = None
super().__init__()
self._training_docs = None
def download(self):
self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
......
......@@ -2,11 +2,11 @@
import json
import random
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
class CoQA(Dataset):
class CoQA(Task):
def __init__(self):
self.download()
def download(self):
......
......@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from pathlib import Path
from ..base import Dataset
from ..base import Task
class DROP(Dataset):
class DROP(Task):
DATAFOLDER = Path(__file__).parent / "../../data/drop"
def __init__(self):
......
......@@ -61,7 +61,7 @@ class HellaSwag(HFTask):
raise ValueError(
"HellaSwag from HF datasets contained an invalid answer key")
target = doc['endings'][index]
return self.remove_brackets(target)
return " " + self.remove_brackets(target)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
......@@ -75,7 +75,7 @@ class HellaSwag(HFTask):
"""
ll_answers = []
for i in range(4):
continuation = self.remove_brackets(doc['endings'][i])
continuation = " " + self.remove_brackets(doc['endings'][i])
ll_answers.append(rf.loglikelihood(ctx, continuation))
return ll_answers
......
from lm_eval.base import Dataset, rf, mean
from lm_eval.base import Task, rf, mean, perplexity
from lm_eval.utils import sh
import json
import math
from best_download import download_file
class LAMBADA(Dataset):
class LAMBADA(Task):
def download(self):
sh("mkdir -p data/lambada")
download_file(
......@@ -18,22 +18,22 @@ class LAMBADA(Dataset):
return False
def has_validation_docs(self):
return False
return True
def has_test_docs(self):
return True
return False
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
with open("data/lambada/lambada_test.jsonl") as fh:
for line in fh:
yield json.loads(line)
def test_docs(self):
pass
def doc_to_text(self, doc):
return doc['text'].rsplit(' ', 1)[0]
......@@ -45,7 +45,7 @@ class LAMBADA(Dataset):
return ""
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(doc, self.doc_to_target(doc))
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
return ll, is_greedy
......@@ -53,13 +53,13 @@ class LAMBADA(Dataset):
ll, is_greedy = results
return {
'perplexity': math.exp(-ll),
'perplexity': ll,
'accuracy': int(is_greedy)
}
def aggregation(self):
return {
'perplexity': mean,
'perplexity': perplexity,
'accuracy': mean
}
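# Perplexity is now computed at aggregation time as exp(-mean(loglikelihood))
# over all documents, instead of averaging per-document math.exp(-ll) values.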
......
......@@ -30,10 +30,10 @@ class NaturalQs(HFTask):
def fewshot_examples(self, k):
# Data is too large to fit in memory. We just sample from the first bit.
if self._traindocs is None:
self._traindocs = list(islice(self.training_docs(), 0, 100000))
if self._training_docs is None:
self._training_docs = list(islice(self.training_docs(), 0, 100000))
return random.sample(self._traindocs, k)
return random.sample(self._training_docs, k)
def doc_to_text(self, doc):
return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
......
import json
import random
from lm_eval.base import Dataset, rf, mean
from lm_eval.base import Task, rf, mean
from ..utils import sh
import os
class PiQA(Dataset):
class PiQA(Task):
def download(self):
if not os.path.exists('data/piqa'):
#TODO: use best_download
......@@ -46,12 +46,12 @@ class PiQA(Dataset):
return ""
def doc_to_text(self, doc):
return doc[0]['goal']
return doc[0]['goal'] + "\n"
def doc_to_target(self, doc):
# TODO: check whether OA uses a newline here
rightanswer = int(doc[1]) + 1
return '\n' + ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
def construct_requests(self, doc, ctx):
ll_1, _ = rf.loglikelihood(ctx, doc[0]['sol1'])
......
import json
import random
import os
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
class QuAC(Dataset):
class QuAC(Task):
def __init__(self):
super().__init__()
......
......@@ -3,7 +3,19 @@ import datasets
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
from ..utils_stream import each
import os
from functools import reduce
import operator
from tqdm import tqdm
import json
class each:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(map(self.f, other))
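# e.g. [1, 2, 3] >> each(str) == ["1", "2", "3"]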
class RACE(HFTask):
......
import json
import random
import os
from lm_eval.base import Dataset, rf, mean
from lm_eval.base import MultipleChoiceTask, rf, mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
from ..utils import sh
class SATAnalogies(Dataset):
class SATAnalogies(MultipleChoiceTask):
NEEDS_MANUAL_DL = True
def __init__(self):
......@@ -61,8 +61,8 @@ class SATAnalogies(Dataset):
doc = {
'source': source,
'query': query.split(' ')[:2],
'choices': [c.split(' ')[:2] for c in choices],
'answer_key': ['a','b','c','d','e'].index(answer_key.strip()),
'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in choices],
'gold': ['a','b','c','d','e'].index(answer_key.strip()),
}
yield doc
......@@ -72,35 +72,4 @@ class SATAnalogies(Dataset):
return ""
def doc_to_text(self, doc):
return "{} is to {} as ".format(*doc['query'])
def doc_to_target(self, doc):
return "{} is to {}".format(*doc['choices'][doc['answer_key']])
def construct_requests(self, doc, ctx):
lls = [
rf.loglikelihood(ctx, "{} is to {}".format(*doc['choices'][i]))[0]
for i in range(5)
]
return lls
def process_results(self, doc, results):
gold = doc["answer_key"]
acc = 1. if np.argmax(results) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
return "{} is to {} as".format(*doc['query'])
import json
import random
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
import csv
class StoryCloze(Dataset):
class StoryCloze(Task):
NEEDS_MANUAL_DL = True
def download(self):
......