Commit 37c3139d authored by thefazzer

Merge remote-tracking branch 'origin/master' into fazz/refactor-task-coqa

parents 79c9b68a 7ad6bf45
env
*.pyc
data/
.idea
lm_cache
\ No newline at end of file
import abc import abc
import random import random
import numpy as np import numpy as np
import sklearn
import math from lm_eval.metrics import mean
class LM(abc.ABC): class LM(abc.ABC):
...@@ -15,7 +15,8 @@ class LM(abc.ABC): ...@@ -15,7 +15,8 @@ class LM(abc.ABC):
:param requests: list :param requests: list
A list of pairs (context, continuation) A list of pairs (context, continuation)
context: str context: str
Context string Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str continuation: str
The continuation over which log likelihood will be calculated. If The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation. there is a word boundary, the space should be in the continuation.
...@@ -29,6 +30,7 @@ class LM(abc.ABC): ...@@ -29,6 +30,7 @@ class LM(abc.ABC):
""" """
pass pass
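For reference, a minimal sketch of how these (context, continuation) requests are consumed. The ConstantLM class and its scores are invented for illustration, assuming (as in this diff) that loglikelihood and greedy_until are the only abstract methods on lm_eval.base.LM:

from lm_eval.base import LM

class ConstantLM(LM):
    # Toy model: every continuation gets log-likelihood -1.0 and is never the greedy output.
    def loglikelihood(self, requests):
        return [(-1.0, False) for _ in requests]

    def greedy_until(self, requests):
        return ["" for _ in requests]

lm = ConstantLM()
# The context may be the empty string; implementations must handle that case.
print(lm.loglikelihood([("The capital of France is", " Paris"), ("", "Hello world")]))
# -> [(-1.0, False), (-1.0, False)]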
# TODO: Add an optional max length
@abc.abstractmethod @abc.abstractmethod
def greedy_until(self, requests): def greedy_until(self, requests):
"""Generate greedily until a stopping sequence """Generate greedily until a stopping sequence
...@@ -37,9 +39,9 @@ class LM(abc.ABC): ...@@ -37,9 +39,9 @@ class LM(abc.ABC):
A list of pairs (context, until) A list of pairs (context, until)
context: str context: str
Context string Context string
until: str until: [str]
The string sequence to generate until. This string sequence may The string sequences to generate until. These string sequences
span across multiple tokens, or may be part of one token. may each span across multiple tokens, or may be part of one token.
:return: list :return: list
A list of strings continuation A list of strings continuation
continuation: str continuation: str
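Because `until` is now a list of stop sequences, an implementation typically cuts the generated text at the first occurrence of any of them. A standalone sketch of that trimming step (the example strings are made up; the GPT-2 implementation later in this diff does the same thing with split):

def trim_at_stop_sequences(generated, until):
    # Keep only the text before the earliest stop sequence.
    for stop in until:
        generated = generated.split(stop)[0]
    return generated

print(trim_at_stop_sequences("42\n\nQuestion: what is 3+3?", ["\n\n", "Question:"]))
# -> "42"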
...@@ -60,6 +62,14 @@ class LM(abc.ABC): ...@@ -60,6 +62,14 @@ class LM(abc.ABC):
class Task(abc.ABC):
"""A task represents an entire benchmark, including its dataset, problems,
answers, and evaluation methods. See BoolQ for a simple example implementation.

A `doc` can be any python object which represents one instance of evaluation.
This is usually a dictionary, e.g.
{"question": ..., "answer": ...} or
(question, answer)
"""
def __init__(self): def __init__(self):
self.download() self.download()
self._training_docs = None self._training_docs = None
...@@ -147,9 +157,9 @@ class Task(abc.ABC): ...@@ -147,9 +157,9 @@ class Task(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
def aggregation(self): def aggregation(self):
""" """
:returns: {str: [float] -> float} :returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metric scores
""" """
pass pass
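Concretely, the aggregation map pairs each submetric name with a function that reduces the per-document scores into one number; a small illustration using helpers from the new lm_eval.metrics module (the scores below are made up):

from lm_eval.metrics import mean, perplexity

acc_scores = [1, 0, 1, 1]            # per-document 0/1 accuracy scores
loglikelihoods = [-2.3, -1.7, -3.1]  # per-document log-likelihoods

aggregation = {"acc": mean, "ppl": perplexity}
print(aggregation["acc"](acc_scores))      # 0.75
print(aggregation["ppl"](loglikelihoods))  # exp(-mean(lls)), roughly 10.7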
...@@ -212,62 +222,9 @@ class MultipleChoiceTask(Task): ...@@ -212,62 +222,9 @@ class MultipleChoiceTask(Task):
} }
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def perplexity(items):
return math.exp(-mean(items))
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
}
import os
...@@ -275,13 +232,9 @@ import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(args): def hash_args(attr, args):
dat = b"" dat = json.dumps([attr] + list(args))
for arg in args: return hashlib.sha256(dat.encode('utf-8')).hexdigest()
assert isinstance(arg, str) or isinstance(arg, int)
dat += str(arg).encode()
dat += b"\0"
return hashlib.sha256(dat).hexdigest()
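The new hash_args keys the cache on both the method name and the request arguments; re-stated here so the snippet runs standalone (the example requests are illustrative):

import hashlib
import json

def hash_args(attr, args):
    # Same scheme as the new hash_args above: JSON-serialise [method, *args], then SHA-256.
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode('utf-8')).hexdigest()

# Identical args now hash differently for different request types.
print(hash_args("loglikelihood", ("Question: 2+2=", " 4")))
print(hash_args("greedy_until", ("Question: 2+2=", ["\n"])))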
class CachingLM: class CachingLM:
...@@ -298,7 +251,7 @@ class CachingLM: ...@@ -298,7 +251,7 @@ class CachingLM:
# figure out which ones are cached and which ones are new # figure out which ones are cached and which ones are new
for req in requests: for req in requests:
hsh = attr + '_' + hash_args(req) hsh = hash_args(attr, req)
if hsh in self.dbdict: if hsh in self.dbdict:
ob = self.dbdict[hsh] ob = self.dbdict[hsh]
...@@ -320,9 +273,9 @@ class CachingLM: ...@@ -320,9 +273,9 @@ class CachingLM:
res[resptr] = r res[resptr] = r
# caching # caching
hsh = attr + '_' + hash_args(req) hsh = hash_args(attr, req)
self.dbdict[hsh] = r self.dbdict[hsh] = r
self.dbdict.commit()
return res return res
return fn return fn
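The caching flow above (hash the request, look it up, fall through to the real model only for misses, then write back and commit) can be sketched with a plain dict standing in for SqliteDict; this is a simplified illustration, not the actual CachingLM:

class DictCachingLM:
    def __init__(self, lm):
        self.lm = lm
        self.cache = {}  # SqliteDict in the real implementation

    def loglikelihood(self, requests):
        res = [None] * len(requests)
        miss_idx, misses = [], []
        for i, req in enumerate(requests):
            key = ("loglikelihood",) + tuple(req)
            if key in self.cache:
                res[i] = self.cache[key]
            else:
                miss_idx.append(i)
                misses.append(req)
        # Only cache misses are forwarded to the underlying model.
        for i, r in zip(miss_idx, self.lm.loglikelihood(misses)):
            self.cache[("loglikelihood",) + tuple(requests[i])] = r
            res[i] = r
        return res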
...@@ -338,12 +291,19 @@ class Request: ...@@ -338,12 +291,19 @@ class Request:
self.index = index self.index = index
def __iter__(self): def __iter__(self):
if req_ret_lens[self.type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0 i = 0
for i in range(req_ret_lens[self.type]): for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i) yield Request(self.type, self.args, i)
def __getitem__(self, i): def __getitem__(self, i):
if req_ret_lens[self.type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i) return Request(self.type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
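The __iter__/__getitem__ pair is what lets tasks write `ll, is_greedy = rf.loglikelihood(ctx, cont)`: unpacking a loglikelihood request yields one indexed sub-request per return slot, while request types without a fixed return arity (greedy_until) now raise IndexError instead of silently iterating. A rough illustration, assuming the rf factory exported from lm_eval.base:

from lm_eval.base import rf

req = rf.loglikelihood("Question: 2+2=", " 4")
ll, is_greedy = req                # unpacks into sub-requests with index 0 and 1
print(ll.index, is_greedy.index)   # -> 0 1

gen = rf.greedy_until("Question: 2+2=", ["\n"])
# iter(gen) or gen[0] now raises IndexError, since req_ret_lens['greedy_until'] is None.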
class RequestFactory: class RequestFactory:
......
...@@ -39,6 +39,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit): ...@@ -39,6 +39,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
......
import math
from pprint import pprint
import numpy as np
import sacrebleu
import sklearn
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
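acc_all implements the MultiRC-style rule that every answer belonging to a question must be classified correctly; a worked example with two questions (the documents below are fabricated):

from lm_eval.metrics import acc_all

items = [
    # (model prediction as bool, SuperGLUE-style doc)
    (True,  {"idx": {"question": 0}, "label": 1}),
    (True,  {"idx": {"question": 0}, "label": 1}),
    (False, {"idx": {"question": 1}, "label": 1}),  # miss: gold is True
    (True,  {"idx": {"question": 1}, "label": 0}),  # miss: gold is False
]
print(acc_all(items))  # question 0 fully correct, question 1 not -> 0.5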
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
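metric_max_over_ground_truths is the usual SQuAD-style trick of scoring the prediction against every reference and keeping the best score; a quick illustration with a hand-rolled exact-match metric:

from lm_eval.metrics import metric_max_over_ground_truths

def exact_match(prediction, ground_truth):
    return float(prediction.strip().lower() == ground_truth.strip().lower())

print(metric_max_over_ground_truths(
    exact_match, "the eiffel tower", ["Eiffel Tower", "The Eiffel Tower"]))
# -> 1.0: the second reference matches after normalisation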
def perplexity(items):
return math.exp(-mean(items))
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
def _sacreformat(refs, preds):
"""Format refs and preds for sacrebleu corpus calculation. It is very particular"""
# Sacrebleu expects (List[str], List[List[str])
# e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
# Note [ref1_stream] is the first reference for each pred.
# So lists are size N and (M, N) for N preds and M possible refs for each pred
# This is a different order of dimensions than I would expect
# We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
# Must become List[List[str]] with the inner list corresponding to preds
if not isinstance(refs, list):
refs = list(refs)
if not isinstance(refs[0], list):
refs = [[ref] for ref in refs]
refs = list(zip(*refs))
# Note the number of refs in each ref list must match the number of preds
# We expect preds to be List[str] or List[List[str]]. Must become List[str]
if not isinstance(preds, list):
preds = list(preds)
if isinstance(preds[0], list):
assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
preds = [pred[0] for pred in preds]
return refs, preds
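The reshaping in _sacreformat exists because sacrebleu's corpus functions take the hypotheses as a flat list and the references transposed, one inner list per reference set rather than one per prediction. A minimal direct call with those shapes (the sentences are made up):

import sacrebleu

preds = ["the cat sat on the mat", "hello world"]
# One inner list per reference set, each aligned with preds (here: 2 preds, 1 reference set).
refs = [["the cat sat on the mat", "hello there world"]]

print(sacrebleu.corpus_bleu(preds, refs).score)
print(sacrebleu.corpus_chrf(preds, refs).score)
print(sacrebleu.corpus_ter(preds, refs).score)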
...@@ -19,5 +19,9 @@ class DummyLM(LM): ...@@ -19,5 +19,9 @@ class DummyLM(LM):
return res return res
def greedy_until(self, requests): def greedy_until(self, requests):
# TODO: implement res = []
pass
for _ in requests:
res.append("lol")
return res
...@@ -7,41 +7,75 @@ from tqdm import tqdm ...@@ -7,41 +7,75 @@ from tqdm import tqdm
class GPT2LM(LM): class GPT2LM(LM):
def __init__(self, device="cpu"): MAX_GEN_TOKS = 256
def __init__(self, device="cpu", pretrained='gpt2'):
self.device = torch.device(device) self.device = torch.device(device)
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device) self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained(pretrained).to(self.device)
self.gpt2.eval() self.gpt2.eval()
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained(pretrained)
self.tokenizer.pad_token = "<|endoftext|>" self.tokenizer.pad_token = "<|endoftext|>"
@classmethod @classmethod
def create_from_arg_string(cls, arg_string): def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string) args = utils.simple_parse_args_string(arg_string)
return cls(device=args.get("device", "cpu")) return cls(device=args.get("device", "cpu"), pretrained=args.get("pretrained", "gpt2"))
def loglikelihood(self, requests): def loglikelihood(self, requests):
res = [] res = []
# TODO: vectorize properly with torch.no_grad():
for context, continuation in tqdm(requests): # TODO: vectorize properly
# when too long to fit in context, truncate from the left # TODO: automatic batch size detection for vectorization
context_enc = self.tokenizer.encode(context) for context, continuation in tqdm(requests):
continuation_enc = self.tokenizer.encode(continuation) # when too long to fit in context, truncate from the left
inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024) if context == "":
# end of text as context
cont_toks = inp[:, ctxlen:] # [batch, seq] context_enc = [50256]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab] else:
context_enc = self.tokenizer.encode(context)
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all() continuation_enc = self.tokenizer.encode(continuation)
inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq] logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
res.append((float(logits.sum()), bool(max_equal))) res.append((float(logits.sum()), bool(max_equal)))
return res return res
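The truncation above keeps only the last 1024 tokens of context + continuation and then scores just the continuation slice; the ctxlen arithmetic can be checked in isolation (the token counts below are invented, and the continuation is assumed to fit in the window):

MAX_LEN = 1024

def continuation_slice(n_ctx, n_cont, max_len=MAX_LEN):
    # Mirrors: ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
    overflow = max(0, n_ctx + n_cont - max_len)
    ctxlen = n_ctx - overflow
    kept = min(n_ctx + n_cont, max_len)
    return ctxlen, kept - ctxlen  # (tokens of context kept, tokens scored)

print(continuation_slice(10, 5))     # (10, 5): nothing truncated
print(continuation_slice(1100, 30))  # (994, 30): 106 context tokens dropped from the left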
def greedy_until(self, requests): def greedy_until(self, requests):
# TODO: implement # TODO: implement fully general `until` that handles untils that are
pass # multiple tokens or that span multiple tokens correctly
res = []
for context, until in tqdm(requests):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context)]).to(self.device)
primary_until, = self.tokenizer.encode(until[0])
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
res.append(s)
return res
...@@ -72,7 +72,12 @@ class GPT3LM(LM): ...@@ -72,7 +72,12 @@ class GPT3LM(LM):
inps = [] inps = []
ctxlens = [] ctxlens = []
for context, continuation in chunk: for context, continuation in chunk:
context_enc = self.tokenizer.encode(context) if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation) continuation_enc = self.tokenizer.encode(continuation)
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:] inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH) ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
...@@ -108,6 +113,7 @@ class GPT3LM(LM): ...@@ -108,6 +113,7 @@ class GPT3LM(LM):
max_tokens=self.MAX_GEN_TOKS, max_tokens=self.MAX_GEN_TOKS,
temperature=0., temperature=0.,
logprobs=10, logprobs=10,
stop=until
) )
res.append(response.choices[0]['text']) res.append(response.choices[0]['text'])
......
from pprint import pprint
from . import superglue from . import superglue
from . import glue from . import glue
from . import arc from . import arc
...@@ -21,7 +23,10 @@ from . import triviaqa ...@@ -21,7 +23,10 @@ from . import triviaqa
from . import pubmedqa from . import pubmedqa
from . import sciq from . import sciq
from . import webqs from . import webqs
from . import qa4mre
from . import translation
from . import headqa
from . import mathqa
TASK_REGISTRY = { TASK_REGISTRY = {
# GLUE # GLUE
...@@ -49,19 +54,26 @@ TASK_REGISTRY = { ...@@ -49,19 +54,26 @@ TASK_REGISTRY = {
"lambada": lambada.LAMBADA, "lambada": lambada.LAMBADA,
"piqa": piqa.PiQA, "piqa": piqa.PiQA,
# Science related
"pubmedqa" : pubmedqa.Pubmed_QA, "pubmedqa" : pubmedqa.Pubmed_QA,
"sciq" : sciq.SciQ, "sciq" : sciq.SciQ,
#"qa4mre" : qa4mre.QA4MRE,
"qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
#"triviaqa": triviaqa.TriviaQA, #"triviaqa": triviaqa.TriviaQA,
"arc_easy": arc.ARCEasy, "arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge, "arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet # "quac": quac.QuAC, # not implemented yet
"hellaswag": hellaswag.HellaSwag, # not implemented yet "hellaswag": hellaswag.HellaSwag, # not implemented yet
# "openbookqa": openbookqa.OpenBookQA, # not implemented yet "openbookqa": openbookqa.OpenBookQA,
# "sat": sat.SATAnalogies, # not implemented yet # "sat": sat.SATAnalogies, # not implemented yet
# "squad": squad.SQuAD, # not implemented yet # "squad": squad.SQuAD, # not implemented yet
"race": race.RACE, "race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet # "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQA,
"mathqa": mathqa.MathQA,
"webqs": webqs.WebQs, "webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273, "wsc273": wsc273.WinogradSchemaChallenge273,
"winogrande": winogrande.Winogrande, "winogrande": winogrande.Winogrande,
...@@ -80,6 +92,11 @@ TASK_REGISTRY = { ...@@ -80,6 +92,11 @@ TASK_REGISTRY = {
"arithmetic_2dm": arithmetic.Arithmetic2DMultiplication, "arithmetic_2dm": arithmetic.Arithmetic2DMultiplication,
"arithmetic_1dc": arithmetic.Arithmetic1DComposite, "arithmetic_1dc": arithmetic.Arithmetic1DComposite,
# TODO Perhaps make these groups of tasks
# e.g. anli, arithmetic, openai_translations, harness_translations
# e.g. wmt14-fr-en
**translation.create_tasks_from_benchmarks(translation.selected_benchmarks)
} }
...@@ -87,7 +104,12 @@ ALL_TASKS = sorted(list(TASK_REGISTRY)) ...@@ -87,7 +104,12 @@ ALL_TASKS = sorted(list(TASK_REGISTRY))
def get_task(task_name): def get_task(task_name):
return TASK_REGISTRY[task_name] try:
return TASK_REGISTRY[task_name]
except KeyError as e:
print("Available tasks:")
pprint(TASK_REGISTRY)
raise KeyError(f"Missing task {task_name}")
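With the new error handling, a typo in a task name prints the registry before re-raising, while lookups of valid names are unchanged; a quick usage sketch (assuming the package is importable as lm_eval.tasks):

from lm_eval import tasks

print(tasks.get_task("lambada"))  # -> <class 'lm_eval.tasks.lambada.LAMBADA'>
try:
    tasks.get_task("lambda")      # typo: prints the available tasks, then raises
except KeyError as err:
    print(err)                    # 'Missing task lambda'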
def get_task_dict(task_name_list): def get_task_dict(task_name_list):
......
import numpy as np import numpy as np
from lm_eval.base import rf, mean from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask from . common import HFTask
class ANLIBase(HFTask): class ANLIBase(HFTask):
...@@ -39,7 +40,7 @@ class ANLIBase(HFTask): ...@@ -39,7 +40,7 @@ class ANLIBase(HFTask):
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really* # appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did? # want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + '\nTrue, False, or Neither?' return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def doc_to_target(self, doc): def doc_to_target(self, doc):
# True = entailment # True = entailment
......
import numpy as np import numpy as np
from lm_eval.base import rf, mean from lm_eval.base import MultipleChoiceTask
from ..metrics import mean
from . common import HFTask from . common import HFTask
class ARCEasy(HFTask): class ARCEasy(HFTask, MultipleChoiceTask):
DATASET_PATH = "ai2_arc" DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy" DATASET_NAME = "ARC-Easy"
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def __clean_data(self):
""" Resolves various edge cases in the unprocessed HF ARC dataset. """
# NOTE: Some `doc["answerKey"]`s are in numeric string format being one
# of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
result = {}
for split, data in self.data.items():
result[split] = []
for doc in data:
# Ensure all `answerKey`s and `label`s are in letter format.
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [
num_to_letter.get(label, label) for label in doc["choices"]["label"]
]
result[split].append(doc)
return result
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -39,68 +17,41 @@ class ARCEasy(HFTask): ...@@ -39,68 +17,41 @@ class ARCEasy(HFTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def fewshot_description(self): def _convert_standard(self, doc):
# TODO: figure out description # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
return "" # of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
def doc_to_text(self, doc): doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
return "Question: " + doc['question'] + '\nAnswer:' out_doc = {
"id": doc["id"],
def doc_to_target(self, doc): "query": "Question: " + doc["question"] + "\nAnswer:",
index = self.letter_to_num[doc["answerKey"]] "choices": doc["choices"]["text"],
return " " + doc['choices']['text'][index] "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
}
return out_doc
def construct_requests(self, doc, ctx): def _load_docs(self, docs):
""" Uses RequestFactory to construct Requests and returns an iterable of for record in docs:
Requests which will be sent to the LM. yield self._convert_standard(record)
:param doc: def training_docs(self):
The document as returned from training_docs, validation_docs, or test_docs. docs = super().training_docs()
:param ctx: str return self._load_docs(docs)
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_choices = []
for choice in doc["choices"]["text"]:
ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
return ll_choices
def process_results(self, doc, results): def validation_docs(self):
"""Take a single document and the LM results and evaluates, returning a docs = super().validation_docs()
dict where keys are the names of submetrics and values are the values of return self._load_docs(docs)
the metric for that one document
:param doc: def test_docs(self):
The document as returned from training_docs, validation_docs, or test_docs. docs = super().test_docs()
:param results: return self._load_docs(docs)
The results of the requests created in construct_requests.
"""
gold = self.letter_to_num[doc["answerKey"]]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self): def fewshot_description(self):
""" # TODO: figure out description
:returns: {str: [float] -> float} return ""
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self): def doc_to_text(self, doc):
""" return doc["query"]
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
class ARCChallenge(ARCEasy): class ARCChallenge(ARCEasy):
......
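The ARC refactor above normalises each HF record into the MultipleChoiceTask layout (query/choices/gold), mapping the occasional numeric answer keys back to letters. Applying the same conversion by hand to an invented record:

num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

doc = {  # fabricated ARC-style record
    "id": "Mercury_0000",
    "question": "Which gas do plants absorb from the air?",
    "choices": {"text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
                "label": ["1", "2", "3", "4"]},
    "answerKey": "2",
}

answer_key = num_to_letter.get(doc["answerKey"], doc["answerKey"])  # "2" -> "B"
out_doc = {
    "id": doc["id"],
    "query": "Question: " + doc["question"] + "\nAnswer:",
    "choices": doc["choices"]["text"],
    "gold": ["A", "B", "C", "D", "E"].index(answer_key),  # -> 1
}
print(out_doc["choices"][out_doc["gold"]])  # -> "Carbon dioxide"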
...@@ -2,7 +2,8 @@ import abc ...@@ -2,7 +2,8 @@ import abc
import json import json
import os import os
from collections import namedtuple from collections import namedtuple
from lm_eval.base import Task, mean, rf from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file from best_download import download_file
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion']) ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
...@@ -56,14 +57,17 @@ class Arithmetic(Task): ...@@ -56,14 +57,17 @@ class Arithmetic(Task):
return doc.completion return doc.completion
def load_doc(self, doc_json): def load_doc(self, doc_json):
return ArithmeticDoc(context=doc_json['context'], completion=doc_json['completion']) return ArithmeticDoc(context=doc_json['context'].strip()
.replace('\n\n', '\n')
.replace('Q:', 'Question:')
.replace('A:', 'Answer:'), completion=doc_json['completion'])
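The string cleanup in load_doc rewrites the raw arithmetic prompts into the harness's Question/Answer style; applied to a made-up raw context:

raw_context = "Q: What is 17 plus 25?\n\nA:"
context = (raw_context.strip()
           .replace("\n\n", "\n")
           .replace("Q:", "Question:")
           .replace("A:", "Answer:"))
print(repr(context))  # -> 'Question: What is 17 plus 25?\nAnswer:'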
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll, is_prediction = rf.loglikelihood(ctx, doc.completion) ll, is_prediction = rf.loglikelihood(ctx, doc.completion)
return is_prediction return is_prediction
def process_results(self, doc, results): def process_results(self, doc, results):
ll, is_prediction = results is_prediction, = results
return { return {
"acc": is_prediction "acc": is_prediction
} }
......
import datasets import datasets
import numpy as np import numpy as np
import lm_eval.metrics
from ..base import Task from ..base import Task
...@@ -44,7 +46,7 @@ class HFTask(Task): ...@@ -44,7 +46,7 @@ class HFTask(Task):
def simple_accuracy_metric(preds, golds): def simple_accuracy_metric(preds, golds):
acc = float((np.array(preds) == np.array(golds)).mean()) acc = float(lm_eval.metrics.mean())
return { return {
"major": acc, "major": acc,
"minor": {"acc": acc}, "minor": {"acc": acc},
......
import numpy as np import numpy as np
from lm_eval.base import rf, mean, f1_score, matthews_corrcoef from lm_eval.base import rf
from ..metrics import mean, matthews_corrcoef, f1_score
from scipy.stats import pearsonr, spearmanr from scipy.stats import pearsonr, spearmanr
from tqdm import auto as tqdm_lib from tqdm import auto as tqdm_lib
from . common import HFTask, yesno from . common import HFTask, yesno
from ..utils import general_detokenize
# Single-Sentence Tasks # Single-Sentence Tasks
...@@ -22,17 +23,18 @@ class CoLA(HFTask): ...@@ -22,17 +23,18 @@ class CoLA(HFTask):
return True return True
def fewshot_description(self): def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?" # TODO
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Sentence: {}\nAnswer:".format(doc["sentence"]) return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format({1: "True", 0: "False"}[doc["label"]]) return " {}".format({1: "yes", 0: "no"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True") ll_true, _ = rf.loglikelihood(ctx, " yes")
ll_false, _ = rf.loglikelihood(ctx, " False") ll_false, _ = rf.loglikelihood(ctx, " no")
return ll_true, ll_false return ll_true, ll_false
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -68,19 +70,19 @@ class SST(HFTask): ...@@ -68,19 +70,19 @@ class SST(HFTask):
return True return True
def fewshot_description(self): def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative." return "Indicate if the sentiment of each sentence is positive or negative."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "sentence:\t{}\t\nanswer:".format( return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
doc["sentence"], general_detokenize(doc["sentence"]),
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]]) return " {}".format({1: "positive", 0: "negative"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_positive, _ = rf.loglikelihood(ctx, " Positive") ll_positive, _ = rf.loglikelihood(ctx, " positive")
ll_negative, _ = rf.loglikelihood(ctx, " Negative") ll_negative, _ = rf.loglikelihood(ctx, " negative")
return ll_positive, ll_negative return ll_positive, ll_negative
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -127,9 +129,9 @@ class MNLI(HFTask): ...@@ -127,9 +129,9 @@ class MNLI(HFTask):
return self.data["test_matched"] return self.data["test_matched"]
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format( return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["premise"], doc["premise"],
doc["hypothesis"], doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
...@@ -187,7 +189,7 @@ class QNLI(HFTask): ...@@ -187,7 +189,7 @@ class QNLI(HFTask):
return True return True
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format( return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
doc["question"], doc["question"],
doc["sentence"], doc["sentence"],
) )
...@@ -195,11 +197,11 @@ class QNLI(HFTask): ...@@ -195,11 +197,11 @@ class QNLI(HFTask):
def doc_to_target(self, doc): def doc_to_target(self, doc):
# True = entailment # True = entailment
# False = not entailment # False = not entailment
return " {}".format({0: "Yes", 1: "No"}[doc["label"]]) return " {}".format({0: "yes", 1: "no"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, " Yes") ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " No") ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no return ll_yes, ll_no
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -235,7 +237,7 @@ class WNLI(HFTask): ...@@ -235,7 +237,7 @@ class WNLI(HFTask):
return True return True
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format( return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
...@@ -284,7 +286,7 @@ class RTE(HFTask): ...@@ -284,7 +286,7 @@ class RTE(HFTask):
return True return True
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format( return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
...@@ -338,9 +340,9 @@ class MRPC(HFTask): ...@@ -338,9 +340,9 @@ class MRPC(HFTask):
return "Indicate if both sentences mean the same thing." return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format( return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
doc["sentence1"], general_detokenize(doc["sentence1"]),
doc["sentence2"], general_detokenize(doc["sentence2"]),
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
...@@ -390,7 +392,7 @@ class QQP(HFTask): ...@@ -390,7 +392,7 @@ class QQP(HFTask):
return "Indicate if both questions ask the same thing." return "Indicate if both questions ask the same thing."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "question 1:\t{}\nquestion 2:\t{}\nanswer:".format( return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
doc["question1"], doc["question1"],
doc["question2"], doc["question2"],
) )
...@@ -443,7 +445,7 @@ class STSB(HFTask): ...@@ -443,7 +445,7 @@ class STSB(HFTask):
"where 5 means identical and 0 means unrelated." "where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format( return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
......
from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["qid"],
"query": "Question: " + doc["qtext"] + "\nAnswer:",
"choices": [answer["atext"] for answer in doc["answers"]],
"gold": int(doc["ra"]) - 1,
}
return out_doc
def _load_docs(self, docs):
for doc in docs:
yield self._convert_standard(doc)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import re import re
import numpy as np from lm_eval.base import MultipleChoiceTask
from ..base import rf, mean
from . common import HFTask from . common import HFTask
class HellaSwag(HFTask): class HellaSwag(HFTask, MultipleChoiceTask):
DATASET_PATH = "hellaswag" DATASET_PATH = "hellaswag"
DATASET_NAME = None DATASET_NAME = None
@classmethod
def remove_brackets(cls, text):
""" Removes brackets from HellaSwag documents.
NOTE: The brackets are artifacts of the WikiHow dataset portion underlying
HellaSwag.
"""
text = re.sub('\[.*?\]', '', text)
return text
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -24,19 +14,37 @@ class HellaSwag(HFTask): ...@@ -24,19 +14,37 @@ class HellaSwag(HFTask):
return True return True
def has_test_docs(self): def has_test_docs(self):
return True return False
@classmethod
def preprocess(cls, text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub('\\[.*?\\]', '', text)
text = text.replace("  ", " ")
return text
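The preprocess step strips the WikiHow artifacts noted in the comment; re-stated standalone on a fabricated snippet so its effect is visible (the double-space collapse on the last replace is assumed from context):

import re

def preprocess(text):
    # Same cleanup as HellaSwag.preprocess above.
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")
    return text

print(preprocess("How to fold a shirt [title] Lay the shirt flat [step] on a table."))
# -> "How to fold a shirt. Lay the shirt flat on a table."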
def _convert_standard(self, doc):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
out_doc = {
"query": self.preprocess(doc['activity_label'] + ': ' + ctx),
"choices": [self.preprocess(ending) for ending in doc['endings']],
"gold": int(doc['label']),
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self): def training_docs(self):
if self.has_training_docs(): docs = super().training_docs()
return self.data["train"] return self._load_docs(docs)
def validation_docs(self): def validation_docs(self):
if self.has_validation_docs(): docs = super().validation_docs()
return self.data["validation"] return self._load_docs(docs)
def test_docs(self):
if self.has_test_docs():
return self.data["test"]
def fewshot_description(self): def fewshot_description(self):
return "Label for the relevant action: Sentences describing the " \ return "Label for the relevant action: Sentences describing the " \
...@@ -44,73 +52,4 @@ class HellaSwag(HFTask): ...@@ -44,73 +52,4 @@ class HellaSwag(HFTask):
"plausibly completes the situation." "plausibly completes the situation."
def doc_to_text(self, doc): def doc_to_text(self, doc):
text = doc['activity_label'] + ': ' + doc['ctx'] + '\n' return doc["query"]
return self.remove_brackets(text)
def doc_to_target(self, doc):
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError(
"HellaSwag from HF datasets contained an invalid answer key")
target = doc['endings'][index]
return " " + self.remove_brackets(target)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_answers = []
for i in range(4):
continuation = " " + self.remove_brackets(doc['endings'][i])
ll_answers.append(rf.loglikelihood(ctx, continuation))
return ll_answers
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = int(doc['label'])
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
return {
"acc": acc
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
from lm_eval.base import Task, rf, mean, perplexity from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh from lm_eval.utils import sh
import json import json
import math import math
...@@ -9,7 +10,7 @@ class LAMBADA(Task): ...@@ -9,7 +10,7 @@ class LAMBADA(Task):
def download(self): def download(self):
sh("mkdir -p data/lambada") sh("mkdir -p data/lambada")
download_file( download_file(
"https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl", "http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl", "data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226" "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
) )
...@@ -53,18 +54,18 @@ class LAMBADA(Task): ...@@ -53,18 +54,18 @@ class LAMBADA(Task):
ll, is_greedy = results ll, is_greedy = results
return { return {
'perplexity': ll, 'ppl': ll,
'accuracy': int(is_greedy) 'acc': int(is_greedy)
} }
def aggregation(self): def aggregation(self):
return { return {
'perplexity': perplexity, 'ppl': perplexity,
'accuracy': mean 'acc': mean
} }
def higher_is_better(self): def higher_is_better(self):
return { return {
'perplexity': False, 'ppl': False,
'accuracy': True 'acc': True
} }
from . common import HFTask
from lm_eval.base import mean, rf, MultipleChoiceTask
import re
class MathQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "math_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
return out_doc
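The regex in _convert_standard splits MathQA's flat options string into individual choices; running it on a representative options string (the values are invented):

import re

options = "a ) 38 , b ) 27.625 , c ) 30 , d ) data inadequate , e ) none of these"
choices = [c[4:].rstrip(" ,")
           for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", options)]
print(choices)
# -> ['38', '27.625', '30', 'data inadequate', 'none of these']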
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import numpy as np from lm_eval.base import MultipleChoiceTask
from scipy.stats import pearsonr, spearmanr from .common import HFTask
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
class OpenBookQA(HFTask):
class OpenBookQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "openbookqa" DATASET_PATH = "openbookqa"
DATASET_NAME = "main" DATASET_NAME = "main"
...@@ -17,82 +15,34 @@ class OpenBookQA(HFTask): ...@@ -17,82 +15,34 @@ class OpenBookQA(HFTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["id"],
"query": doc["question_stem"],
"choices": doc["choices"]["text"],
"gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()),
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self): def training_docs(self):
if self.has_training_docs(): docs = super().training_docs()
if self._training_docs is None: return self._load_docs(docs)
self._training_docs = list(self.data["train"])
return self._training_docs
def validation_docs(self): def validation_docs(self):
if self.has_validation_docs(): docs = super().validation_docs()
return self.data["validation"] return self._load_docs(docs)
def test_docs(self): def test_docs(self):
if self.has_test_docs(): docs = super().test_docs()
return self.data["test"] return self._load_docs(docs)
def fewshot_description(self): def fewshot_description(self):
# TODO: figure out fewshot description # TODO: figure out fewshot description
return "" return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc['question_stem'] + '\n' return doc["query"]
def doc_to_target(self, doc):
letter_answer = doc['answerKey']
if letter_answer == 'A':
index = 0
elif letter_answer == 'B':
index = 1
elif letter_answer == 'C':
index = 2
elif letter_answer == 'D':
index = 3
else:
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
return doc['choices']['text'][index] + '.'
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
import numpy as np import numpy as np
from lm_eval.base import rf, mean from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask from . common import HFTask
...@@ -21,15 +22,15 @@ class PiQA(HFTask): ...@@ -21,15 +22,15 @@ class PiQA(HFTask):
return "" return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["goal"] + "\n" return "Question: "+doc["goal"] + "\nAnswer:"
def doc_to_target(self, doc): def doc_to_target(self, doc):
solutions = [doc["sol1"], doc["sol2"]] solutions = [doc["sol1"], doc["sol2"]]
return solutions[doc["label"]] return " " + solutions[doc["label"]]
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_1, _ = rf.loglikelihood(ctx, doc['sol1']) ll_1, _ = rf.loglikelihood(ctx, " " + doc['sol1'])
ll_2, _ = rf.loglikelihood(ctx, doc['sol2']) ll_2, _ = rf.loglikelihood(ctx, " " + doc['sol2'])
return ll_1, ll_2 return ll_1, ll_2
def process_results(self, doc, results): def process_results(self, doc, results):
......
...@@ -2,7 +2,8 @@ import numpy as np ...@@ -2,7 +2,8 @@ import numpy as np
import json import json
import random import random
from .common import HFTask from .common import HFTask
from lm_eval.base import rf, mean from lm_eval.base import rf
from ..metrics import mean
class Pubmed_QA(HFTask): class Pubmed_QA(HFTask):
...@@ -30,7 +31,7 @@ class Pubmed_QA(HFTask): ...@@ -30,7 +31,7 @@ class Pubmed_QA(HFTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"]) ctxs = "\n".join(doc["context"]["contexts"])
return "abstract: {}\nquestion: {}\nanswer:".format( return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs, ctxs,
doc["question"], doc["question"],
doc["final_decision"] doc["final_decision"]
......