Merge branch 'master' into multilingual

31ebb599 · Stella Biderman · GitHub · 38c04a0f · 8728710c · 31ebb599
Unverified Commit 31ebb599 authored Jan 05, 2022 by Stella Biderman Committed by GitHub Jan 05, 2022
20 changed files
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -23,11 +23,11 @@ jobs:
        path: |
          ~/.cache
        # An explicit key for restoring and saving the cache
-        key: evaldata-cache-3
+        key: evaldata-cache-4
    - name: Set up Python 3.9
      uses: actions/setup-python@v2
      with:
-        python-version: 3.9
+        python-version: 3.9.7
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
@@ -42,7 +42,7 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
-        pytest --cov=lm_eval/ tests/
+        pytest -vv --cov=lm_eval/ tests/
    - name: Upload to codecov
      run: |
        bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
--- a/CITATION.bib
+++ b/CITATION.bib
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
--- a/README.md
+++ b/README.md
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
 import abc
-import random
+from typing import Iterable
 import numpy as np
 import re
+import os
+import json
+import hashlib
+from sqlitedict import SqliteDict
+from tqdm import tqdm
+import torch
+import torch.nn.functional as F

-from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
+from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
+from lm_eval import utils
+from abc import abstractmethod


 class LM(abc.ABC):
    def __init__(self):
        self.cache_hook = CacheHook(None)

-    @abc.abstractmethod
+    @abstractmethod
    def loglikelihood(self, requests):
        """Compute log-likelihood of generating a continuation from a context.
        Downstream tasks should attempt to use loglikelihood instead of other 
@@ -34,7 +43,7 @@ class LM(abc.ABC):
        """
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def loglikelihood_rolling(self, requests):
        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
        - We will use the full max context length of the model.
@@ -77,7 +86,7 @@ class LM(abc.ABC):
        pass

    # TODO: Add an optional max length
-    @abc.abstractmethod
+    @abstractmethod
    def greedy_until(self, requests):
        """Generate greedily until a stopping sequence

@@ -96,18 +105,235 @@ class LM(abc.ABC):
        pass

    @classmethod
-    def create_from_arg_string(cls, arg_string):
-        """Constructor method, in case models need additional arguments
-        e.g. OpenAI API engine, paths for loading, other params
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        additional_config = {} if additional_config is None else additional_config
+        args = utils.simple_parse_args_string(arg_string)
+        args2 = {k: v for k, v in additional_config.items() if v is not None}
+        return cls(**args, **args2)
+
+    def set_cache_hook(self, cache_hook):
+        self.cache_hook = cache_hook
+
+
+class BaseLM(LM):
+
+    @property
+    @abstractmethod
+    def eot_token_id(self):
+        pass
+
+    @property
+    @abstractmethod
+    def max_length(self):
+        pass
+
+    @property
+    @abstractmethod
+    def max_gen_toks(self):
+        pass
+
+    @property
+    @abstractmethod
+    def batch_size(self):
+        pass
+
+    @property
+    @abstractmethod
+    def device(self):
+        pass
+
+    @abstractmethod
+    def tok_encode(self, string: str): pass
+    
+    @abstractmethod
+    def tok_decode(self, tokens: Iterable[int]): pass

-        :param arg_string: str
-            Left up to individual model class to handle
+    @abstractmethod
+    def _model_generate(self, context, max_length, eos_token_id): pass

+    @abstractmethod
+    def _model_call(self, inps):
        """
-        return cls()
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call

-    def set_cache_hook(self, cache_hook):
-        self.cache_hook = cache_hook
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        pass
+
+    # subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
+    # TODO: enforce this somehow
+
+    def loglikelihood(self, requests):
+        new_reqs = []
+        for context, continuation in requests:
+            if context == "":
+                # end of text as context
+                context_enc = [self.eot_token_id]
+            else:
+                context_enc = self.tok_encode(context)
+
+            continuation_enc = self.tok_encode(continuation)
+
+            new_reqs.append(((context, continuation), context_enc, continuation_enc))
+
+        return self._loglikelihood_tokens(new_reqs)
+
+    def loglikelihood_rolling(self, requests):
+        # TODO: Implement caching once we've confirmed the perplexity implementation
+        # TODO: automatic batch size detection for vectorization
+
+        loglikelihoods = []
+        for string, in tqdm(requests):
+            rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
+                token_list=self.tok_encode(string),
+                prefix_token=self.eot_token_id,
+                max_seq_len=self.max_length,
+                context_len=1,
+            )))
+
+            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
+
+            # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
+            # that
+            string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
+            
+            # discard is_greedy
+            string_nll = [x[0] for x in string_nll]
+            
+            string_nll = sum(string_nll)
+            loglikelihoods.append(string_nll)
+
+        return loglikelihoods
+
+    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
+        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
+        res = []
+
+        def _collate(x):
+            # the negative sign on len(toks) sorts descending - this has a few advantages:
+            # - time estimates will always be over not underestimates, which is more useful for planning
+            # - to know the size of a batch when going through the list, you know the first one is always the batch
+            #   padded context length. this is useful to simplify the batching logic and more importantly to make
+            #   automatic adaptive batches much much easier to implement
+            # - any OOMs will happen right away rather than near the end
+
+            toks = x[1] + x[2]
+            return -len(toks), tuple(toks)
+        
+        # TODO: automatic (variable) batch size detection for vectorization
+        reord = utils.Reorderer(requests, _collate)
+        for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
+            inps = []
+            cont_toks_list = []
+            inplens = []
+
+            padding_length = None
+
+            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
+            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
+            # again because vectorizing is annoying
+
+            for _, context_enc, continuation_enc in chunk:
+                # sanity check
+                assert len(context_enc) > 0
+                assert len(continuation_enc) > 0
+                assert len(continuation_enc) <= self.max_length
+
+                # how this all works:
+                #          CTX      CONT
+                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by the trailing [:-1] slice
+                # gpt2    \               \
+                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
+                # cont_toks      4 5 6 7 8 9      logits[inplen-contlen:inplen] slice
+
+                # when too long to fit in context, truncate from the left
+                inp = torch.tensor(
+                    (context_enc + continuation_enc)[-(self.max_length+1):][:-1],
+                    dtype=torch.long
+                ).to(self.device)
+                inplen, = inp.shape
+
+                cont = continuation_enc
+
+                # since in _collate we make sure length is descending, the longest is always the first one.
+                padding_length = padding_length if padding_length is not None else inplen
+
+                # pad length from seq to padding_length
+                inp = torch.cat([
+                    inp,  # [seq]
+                    torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device)  # [padding_length - seq]
+                ], dim=0)
+
+                inps.append(inp.unsqueeze(0))  # [1, padding_length]
+                cont_toks_list.append(cont)
+                inplens.append(inplen)
+
+            batched_inps = torch.cat(inps, dim=0)  # [batch, padding_length]
+            multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu()  # [batch, padding_length, vocab]
+
+            for (cache_key, _, _), logits, inp, inplen, cont_toks \
+                    in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
+
+                # Slice to original seq length
+                contlen = len(cont_toks)
+                logits = logits[inplen-contlen:inplen].unsqueeze(0)  # [1, seq, vocab]
+
+                # Check if per-token argmax is exactly equal to continuation
+                greedy_tokens = logits.argmax(dim=-1)
+                cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)  # [1, seq]
+                max_equal = (greedy_tokens == cont_toks).all()
+
+                # Obtain log-probs at the corresponding continuation token indices
+                # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
+                logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)  # [1, seq]
+
+                # Answer: (log prob, is-exact-match)
+                answer = (float(logits.sum()), bool(max_equal))
+
+                # partial caching
+                if cache_key is not None:
+                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+
+                res.append(answer)
+
+        return reord.get_original(res)
+    
+    def greedy_until(self, requests):
+        # TODO: implement fully general `until` that handles untils that are 
+        #       multiple tokens or that span multiple tokens correctly
+
+        # TODO: extract to TokenizedLM?
+        res = []
+
+        def _collate(x):
+            toks = self.tok_encode(x[0])
+            return len(toks), x[0]
+        
+        reord = utils.Reorderer(requests, _collate)
+
+        for context, until in tqdm(reord.get_reordered()):
+            if isinstance(until, str):
+                until = [until]
+
+            primary_until, = self.tok_encode(until[0])
+            
+            context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
+
+            cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
+
+            s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
+
+            for term in until:
+                s = s.split(term)[0]
+            
+            # partial caching
+            self.cache_hook.add_partial("greedy_until", (context, until), s)
+            
+            res.append(s)
+        
+        return reord.get_original(res)


 class Task(abc.ABC):
@@ -128,17 +354,17 @@ class Task(abc.ABC):
        """Downloads the task dataset if necessary"""
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def has_training_docs(self):
        """Whether the task has a training set"""
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def has_validation_docs(self):
        """Whether the task has a validation set"""
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def has_test_docs(self):
        """Whether the task has a test set"""
        pass
@@ -170,15 +396,15 @@ class Task(abc.ABC):

        return rnd.sample(self._training_docs, k)

-    @abc.abstractmethod
+    @abstractmethod
    def doc_to_text(self, doc):
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def doc_to_target(self, doc):
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of 
        Requests which will be sent to the LM.
@@ -192,7 +418,7 @@ class Task(abc.ABC):
        """
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a 
        dict where keys are the names of submetrics and values are the values of 
@@ -205,7 +431,7 @@ class Task(abc.ABC):
        """
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
@@ -214,7 +440,7 @@ class Task(abc.ABC):
        """
        pass

-    @abc.abstractmethod
+    @abstractmethod
    def higher_is_better(self):
        """
        :returns: {str: bool}
@@ -238,7 +464,9 @@ class Task(abc.ABC):
                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
            else:
                if self._fewshot_docs is None:
-                    self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs())
+                    self._fewshot_docs = list(
+                        self.validation_docs() if self.has_validation_docs() else self.test_docs()
+                    )

                fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)

@@ -253,7 +481,7 @@ class Task(abc.ABC):
        return description + labeled_examples + example


-class MultipleChoiceTask(Task):
+class MultipleChoiceTask(Task, abc.ABC):
    def doc_to_target(self, doc):
        return " " + doc['choices'][doc['gold']]

@@ -328,39 +556,30 @@ class PerplexityTask(Task, abc.ABC):
    def process_results(self, doc, results):
        loglikelihood, = results
        words = self.count_words(doc)
-        bytes = self.count_bytes(doc)
+        bytes_ = self.count_bytes(doc)
        return {
            "word_perplexity": (loglikelihood, words),
-            "byte_perplexity": (loglikelihood, bytes),
-            "bits_per_byte": (-loglikelihood, self.count_bytes(doc))
+            "byte_perplexity": (loglikelihood, bytes_),
+            "bits_per_byte": (loglikelihood, bytes_),
        }

    def aggregation(self):
        return {
            "word_perplexity": weighted_perplexity,
            "byte_perplexity": weighted_perplexity,
-            "bits_per_byte": weighted_mean
+            "bits_per_byte": bits_per_byte,
        }

-    def count_bytes(self, doc):
+    @classmethod
+    def count_bytes(cls, doc):
        return len(doc.encode("utf-8"))
-    
-    def count_words(self, doc):
+
+    @classmethod
+    def count_words(cls, doc):
        """ Downstream tasks with custom word boundaries should override this! """
        return len(re.split(r"\s+", doc))


-req_ret_lens = {
-    'loglikelihood': 2,
-    'greedy_until': None,
-    'loglikelihood_rolling': None,
-}
-
-import os
-import json
-import hashlib
-from sqlitedict import SqliteDict
-
 def hash_args(attr, args):
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode('utf-8')).hexdigest()
@@ -383,9 +602,17 @@ class CacheHook:

 class CachingLM:
    def __init__(self, lm, cache_db):
+        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
+
+        :param lm: LM
+            Underlying LM
+        :param cache_db: str
+            Path to cache db
+        """
        self.lm = lm
        self.cache_db = cache_db
-        if os.path.dirname(cache_db): os.makedirs(os.path.dirname(cache_db), exist_ok=True)
+        if os.path.dirname(cache_db):
+            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
        self.dbdict = SqliteDict(cache_db, autocommit=True)

        # add hook to lm
@@ -409,13 +636,14 @@ class CachingLM:
                    res.append(None)
                    remaining_reqs.append(req)
            
-            # actually run the LM
+            # actually run the LM on the requests that do not have cached results
            rem_res = getattr(self.lm, attr)(remaining_reqs)

            # stick the new ones back into the list and also cache any of the new ones
            resptr = 0
            for req, r in zip(remaining_reqs, rem_res):
-                while res[resptr] is not None: resptr += 1
+                while res[resptr] is not None:
+                    resptr += 1

                res[resptr] = r

@@ -431,32 +659,39 @@ class CachingLM:
        return CacheHook(self)


+REQUEST_RETURN_LENGTHS = {
+    'loglikelihood': 2,
+    'greedy_until': None,
+    'loglikelihood_rolling': None,
+}
+
+
 class Request:
-    def __init__(self, type, args, index=None):
-        if type not in req_ret_lens.keys():
-            raise NotImplementedError('The request type {} is not implemented!'.format(type))
+    def __init__(self, request_type, args, index=None):
+        if request_type not in REQUEST_RETURN_LENGTHS.keys():
+            raise NotImplementedError('The request type {} is not implemented!'.format(request_type))

-        self.type = type
+        self.request_type = request_type
        self.args = args
        self.index = index
    
    def __iter__(self):
-        if req_ret_lens[self.type] is None:
+        if REQUEST_RETURN_LENGTHS[self.request_type] is None:
            raise IndexError('This request type does not return multiple arguments!')
-        i = 0
-        for i in range(req_ret_lens[self.type]):
-            yield Request(self.type, self.args, i)
+        for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
+            yield Request(self.request_type, self.args, i)
    
    def __getitem__(self, i):
-        if req_ret_lens[self.type] is None:
+        if REQUEST_RETURN_LENGTHS[self.request_type] is None:
            raise IndexError('This request type does not return multiple arguments!')
-        return Request(self.type, self.args, i)
+        return Request(self.request_type, self.args, i)
    
    def __eq__(self, other):
-        return self.type == other.type and self.args == other.args and self.index == other.index
+        return self.request_type == other.request_type and self.args == other.args and self.index == other.index

    def __repr__(self):
-        return f"Req_{self.type}{self.args}[{self.index}]\n"
+        return f"Req_{self.request_type}{self.args}[{self.index}]\n"
+

 class RequestFactory:
    def __getattr__(self, attr):

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -2,12 +2,96 @@ import collections
 import itertools
 import random
 import lm_eval.metrics
+import lm_eval.models
+import lm_eval.tasks
+import lm_eval.base
+import numpy as np
+
+
+def simple_evaluate(model, model_args, task_names,
+                    num_fewshot=0, batch_size=None, device=None,
+                    no_cache=False, limit=None, bootstrap_iters=100000):
+    """Instantiate and evaluate a model on a list of tasks.
+
+    :param model: str
+        Name of model, see lm_eval.models.get_model
+    :param model_args: str
+        String arguments for each model class, see LM.create_from_arg_string
+    :param task_names: list[str]
+        List of task names
+    :param num_fewshot: int
+        Number of examples in few-shot context
+    :param batch_size: int, optional
+        Batch size for model
+    :param device: str, optional
+        PyTorch device (e.g. "cpu" or "cuda:0") for running models
+    :param no_cache: bool
+        Whether or not to cache
+    :param limit: int, optional
+        Limit the number of examples per task (only use this for testing)
+    :param bootstrap_iters: int
+        Number of iterations for bootstrap statistics
+    :return:
+        Dictionary of results
+    """
+    random.seed(1234)
+    np.random.seed(1234)
+
+    lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, {
+        'batch_size': batch_size, 'device': device
+    })
+
+    if not no_cache:
+        lm = lm_eval.base.CachingLM(
+            lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
+        )
+    
+    task_dict = lm_eval.tasks.get_task_dict(task_names)
+    results = evaluate(lm, task_dict, False, num_fewshot, limit)
+
+    # add info about the model and few shot config
+    results["config"] = {
+        "model": model,
+        "model_args": model_args,
+        "num_fewshot": num_fewshot,
+        "batch_size": batch_size,
+        "device": device,
+        "no_cache": no_cache,
+        "limit": limit,
+        "bootstrap_iters": bootstrap_iters
+    }
+
+    return results


 def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
+    """Evaluate an already-instantiated language model on a dictionary of tasks.
+
+    :param lm: obj
+        Language Model
+    :param task_dict: dict[str, Task]
+        Dictionary of tasks
+    :param provide_description: bool
+        Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
+    :param num_fewshot: int
+        Number of examples in few-shot context
+    :param limit: int, optional
+        Limit the number of examples per task (only use this for testing)
+    :param bootstrap_iters: int
+        Number of iterations for bootstrap statistics
+    :return:
+        Dictionary of results
+    """
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces

-    task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
+    # TODO: implement proper description-providing system
+    assert not provide_description  # not implemented.
+
+    task_dict_items = [
+        (name, task)
+        for name, task in task_dict.items()
+        if(task.has_validation_docs() or task.has_test_docs())
+    ]

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)
@@ -15,23 +99,25 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

-    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
-    # we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad
-    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have
+    # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
+    # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
+    # over-engineering is bad (or we could make it write the requests to disk and then read them back out again
+    #  - probably using an sqlite db because of all the moving parts we have)

    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
-
    docs = {}

-    # get lists of each type of requeste
+    # get lists of each type of request
    for task_name, task in task_dict_items:
        versions[task_name] = task.VERSION
-        #default to test doc, fall back to val doc if validation unavailable
+        # default to test docs, fall back to validation docs if test docs are unavailable
        # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
        if task.has_test_docs():
            task_doc_func = task.test_docs
        elif task.has_validation_docs():
            task_doc_func = task.validation_docs
+        else:
+            raise RuntimeError("Task has neither test_docs nor validation_docs")

        # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
        task_docs = list(task_doc_func())
@@ -50,25 +136,26 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
            )

            reqs = task.construct_requests(doc, ctx)
-            if not isinstance(reqs, (list, tuple)): reqs = [reqs]
+            if not isinstance(reqs, (list, tuple)):
+                reqs = [reqs]
            for i, req in enumerate(reqs):
-                requests[req.type].append(req)
+                requests[req.request_type].append(req)
                # i: index in requests for a single task instance
                # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.type].append((i, task_name, doc, doc_id))
+                requests_origin[req.request_type].append((i, task_name, doc, doc_id))

    # all responses for each (task, doc)
    process_res_queue = collections.defaultdict(list)

    # execute each type of request
    for reqtype, reqs in requests.items():
-        # TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing
-        # only in index. We could implement some kind of caching, but that would be more of a bandaid
-        # solution. we could also implement some kind of autogrouping here; they should end up next to each other.
+        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
+        #       only in index. We could implement some kind of caching, but that would be more of a band-aid
+        #       solution. we could also implement some kind of auto-grouping here;
+        #       they should end up next to each other.

        print("Running", reqtype, "requests")
        resps = getattr(lm, reqtype)([req.args for req in reqs])
-
        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]

        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
@@ -93,11 +180,49 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
        task = task_dict[task_name]
        results[task_name][metric] = task.aggregation()[metric](items)

-        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=bootstrap_iters)
+        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
+        # so we run them for fewer iterations. still looking for a cleaner way to do this
+        stderr = lm_eval.metrics.stderr_for_metric(
+            metric=task.aggregation()[metric],
+            bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
+        )
        if stderr is not None:
            results[task_name][metric + "_stderr"] = stderr(items)
    
    return {
-        "results": results,
-        "versions": versions
+        "results": dict(results),
+        "versions": dict(versions)
    }
+
+
+def make_table(result_dict):
+    """Generate table of results."""
+    from pytablewriter import MarkdownTableWriter, LatexTableWriter
+
+    md_writer = MarkdownTableWriter()
+    latex_writer = LatexTableWriter()
+    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
+    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
+
+    values = []
+
+    for k, dic in result_dict["results"].items():
+        version = result_dict["versions"][k]
+        for m, v in dic.items():
+            if m.endswith("_stderr"):
+                continue
+
+            if m + "_stderr" in dic:
+                se = dic[m + "_stderr"]
+                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
+            else:
+                values.append([k, version, m, '%.4f' % v, '', ''])
+            k = ""
+            version = ""
+    md_writer.value_matrix = values
+    latex_writer.value_matrix = values
+
+    # todo: make latex table look good
+    # print(latex_writer.dumps())
+
+    return md_writer.dumps()
--- a/lm_eval/metrics.py
+++ b/lm_eval/metrics.py
 import math
-from collections import Iterable
-from pprint import pprint
+from collections.abc import Iterable

 import numpy as np
 import sacrebleu
@@ -63,6 +62,7 @@ def acc_all(items):
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc

+
 def acc_all_stderr(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
@@ -98,9 +98,13 @@ def weighted_mean(items):
    a, b = zip(*items)
    return sum(a) / sum(b)

+
 def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))

+def bits_per_byte(items):
+    return -weighted_mean(items) / math.log(2)
+

 def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
@@ -179,12 +183,13 @@ def _sacreformat(refs, preds):

    return refs, preds

-## stderr stuff
+# stderr stuff

 class _bootstrap_internal:
    def __init__(self, f, n):
        self.f = f
        self.n = n
+
    def __call__(self, v):
        i, xs = v
        rnd = random.Random()
@@ -208,7 +213,9 @@ def bootstrap_stderr(f, xs, iters):
    chunk_size = min(1000, iters)
    from tqdm import tqdm
    print("bootstrapping for stddev:", f.__name__)
-    for bootstrap in tqdm(pool.imap(_bootstrap_internal(f, chunk_size), [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
+    for bootstrap in tqdm(pool.imap(
+            _bootstrap_internal(f, chunk_size),
+            [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
        # sample w replacement
        res.extend(bootstrap)


--- a/lm_eval/models/__init__.py
+++ b/lm_eval/models/__init__.py
@@ -3,6 +3,7 @@ from . import gpt3
 from . import dummy

 MODEL_REGISTRY = {
+    "hf": gpt2.HFLM,
    "gpt2": gpt2.GPT2LM,
    "gpt3": gpt3.GPT3LM,
    "dummy": dummy.DummyLM,

--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
 import transformers
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from lm_eval.base import LM
-from lm_eval import utils
-from tqdm import tqdm
-import numpy as np
+from lm_eval.base import BaseLM


-class GPT2LM(LM):
-    MAX_GEN_TOKS = 256
-    VOCAB_SIZE = 50257
-    EOT_TOKEN_ID = 50256
+class HFLM(BaseLM):

-    def __init__(self, device='cuda', pretrained='gpt2', batch_size=1):
+    def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
+        """Load a Hugging Face causal LM and its tokenizer.
+
+        :param device: str
+            Passed to torch.device(); an empty string selects 'cuda' when
+            available and 'cpu' otherwise
+        :param pretrained: str
+            Model name handed to AutoModelForCausalLM.from_pretrained (and to
+            AutoTokenizer.from_pretrained when `tokenizer` is None)
+        :param revision: str
+            Revision passed to both from_pretrained calls
+        :param subfolder: str or None
+            Appended to `revision` as "revision/subfolder" for the model and
+            passed as `subfolder` for the tokenizer
+        :param tokenizer: str or None
+            Tokenizer name to load instead of `pretrained`
+        :param batch_size: int
+            Stored as the per-GPU batch size
+        """
        super().__init__()
+
+        assert isinstance(device, str)
+        assert isinstance(pretrained, str)
+        assert isinstance(batch_size, int)
+
+        # device is asserted to be a str above, so the fallback branch below is
+        # only reached for an empty string
        if device:
-            self.device = torch.device(device)
+            self._device = torch.device(device)
        else:
-            self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained).to(self.device)
+            self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+        # TODO: update this to be less of a hack once subfolder is fixed in HF
+        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
+            pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
+        ).to(self.device)
        self.gpt2.eval()

-        # pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
-        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
-        self.tokenizer.pad_token = "<|endoftext|>"
-        try:
-            self.max_length = self.gpt2.config.n_ctx
-        except AttributeError:
-            # gptneoconfig doesn't have n_ctx apparantly
-            self.max_length = self.gpt2.config.max_position_embeddings
+        # load the tokenizer for `pretrained`, or for `tokenizer` when one is given explicitly
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+            pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)

-        assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
+        assert isinstance(self.tokenizer, (
+            transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
+            transformers.T5Tokenizer, transformers.T5TokenizerFast,
+        )), "this tokenizer has not been checked for compatibility yet!"

-        # multithreading and batching
-        gpus = torch.cuda.device_count()
-        batch_size_per_gpu = batch_size # todo: adaptive batch size
+        self.vocab_size = self.tokenizer.vocab_size

-        self.batch_size = batch_size_per_gpu * gpus
+        # sanity check: a GPT-2 tokenizer must encode this probe string to the expected ids;
+        # on failure the assertion message is the encoding actually produced
+        if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
+            assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
+                self.tokenizer.encode('hello\n\nhello')
+
+        # multithreading and batching
+        self.batch_size_per_gpu = batch_size  # todo: adaptive batch size

        # TODO: fix multi-gpu
+        # gpus = torch.cuda.device_count()
        # if gpus > 1:
        #     self.gpt2 = nn.DataParallel(self.gpt2)

-    @classmethod
-    def create_from_arg_string(cls, arg_string, additional_config={}):
-        args = utils.simple_parse_args_string(arg_string)
-        args2 = {k: v for k, v in additional_config.items() if v is not None}
-        return cls(**args, **args2)
-
-    def loglikelihood(self, requests):
-        new_reqs = []
-        for context, continuation in requests:
-            if context == "":
-                # end of text as context
-                context_enc = [self.EOT_TOKEN_ID]
-            else:
-                context_enc = self.tokenizer.encode(context)
-
-            continuation_enc = self.tokenizer.encode(continuation)
-
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-
-        return self._loglikelihood_tokens(new_reqs)
-
-    def loglikelihood_rolling(self, requests):
-        # TODO: Implement caching once we've confirmed the perplexity implementation
-        # TODO: automatic batch size detection for vectorization
-
-        loglikelihoods = []
-        with torch.no_grad():
-            for string, in tqdm(requests):
-                rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
-                    token_list=self.tokenizer.encode(string),
-                    prefix_token=self.EOT_TOKEN_ID,
-                    max_seq_len=self.max_length,
-                    context_len=1,
-                )))
-
-                rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-
-                # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
-                string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
-                
-                # discard is_greedy
-                string_nll = [x[0] for x in string_nll]
-                
-                string_nll = sum(string_nll)
-                loglikelihoods.append(string_nll)
-
-        return loglikelihoods
-
-    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
-        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
-        res = []
-        with torch.no_grad():
-
-            def _collate(x):
-                # the negative sign on len(toks) sorts descending - this has a few advantages:
-                # - time estimates will always be over not underestimates, which is more useful for planning
-                # - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
-                #   this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
-                # - any OOMs will happen right away rather than near the end
-
-                toks = x[1] + x[2]
-                return (-len(toks), tuple(toks))
-            
-            # TODO: automatic (variable) batch size detection for vectorization
-            reord = utils.Reorderer(requests, _collate)
-            for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
-                inps = []
-                contlens = []
-                inplens = []
-
-                padding_length = None
-
-                # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
-                # tensors, then we pack them together into a batch, call the model, and then pick it all apart
-                # again because vectorizing is annoying
-
-                for _, context_enc, continuation_enc in chunk:
-                    # sanity check
-                    assert len(context_enc) > 0
-                    assert len(continuation_enc) > 0
-                    assert len(continuation_enc) <= self.max_length
+    @property
+    def eot_token_id(self):
+        """Return the tokenizer's `eos_token_id`, exposed under the end-of-text name."""
+        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
+        return self.tokenizer.eos_token_id

-                    # how this all works:
-                    #          CTX      CONT
-                    # inp    0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
-                    # gpt2    \               \
-                    # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
-                    # cont_toks      4 5 6 7 8 9
-
-                    # when too long to fit in context, truncate from the left
-                    inp = torch.tensor(
-                        (context_enc + continuation_enc)[-(self.max_length+1):][:-1]
-                    , dtype=torch.long).to(self.device)
-                    inplen, = inp.shape
-
-                    cont = continuation_enc
-
-                    # since in _collate we make sure length is descending, the longest is always the first one.
-                    padding_length = padding_length if padding_length is not None else inplen
-
-                    # pad to length
-                    inp = torch.cat([
-                        inp, # [seq]
-                        torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
-                    ], dim=0)
-
-                    inps.append(inp.unsqueeze(0))
-                    contlens.append(cont)
-                    inplens.append(inplen)
-
-                multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu()  # [batch, seq, vocab]
-
-                for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
-                    contlen = len(cont_toks)
-
-                    logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
-
-                    greedy_tokens = logits.argmax(dim=-1)
-
-                    # cont_toks :: [1, seq]
-                    cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
-
-                    max_equal = (greedy_tokens == cont_toks).all()
-
-                    #last_token_slice = logits[:, -1, :].squeeze(0).tolist()
-
-                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
+    @property
+    def max_length(self):
+        """Return the context length from the model config.
+
+        Uses `config.n_ctx` when the attribute exists and falls back to
+        `config.max_position_embeddings` otherwise.
+        """
+        try:
+            return self.gpt2.config.n_ctx
+        except AttributeError:
+            # gptneoconfig doesn't have n_ctx apparently
+            return self.gpt2.config.max_position_embeddings

-                    answer = (float(logits.sum()), bool(max_equal))
+    @property
+    def max_gen_toks(self):
+        """Return the fixed value 256.
+
+        NOTE(review): presumably the generation-length budget consumed by
+        BaseLM; the consumer is not visible here, confirm in lm_eval/base.py.
+        """
+        return 256

-                    # partial caching
-                    if cache_key is not None:
-                        self.cache_hook.add_partial("loglikelihood", cache_key, answer)
+    @property
+    def batch_size(self):
+        """Return the per-GPU batch size stored by __init__ (no GPU-count scaling is applied)."""
+        # TODO: fix multi-gpu
+        return self.batch_size_per_gpu  # * gpus

-                    res.append(answer)
+    @property
+    def device(self):
+        """Return the torch.device chosen in __init__ (stored as `self._device`)."""
+        # TODO: fix multi-gpu
+        return self._device

-        return reord.get_original(res)
+    def tok_encode(self, string: str):
+        """Encode `string` with the tokenizer, without adding special tokens."""
+        return self.tokenizer.encode(string, add_special_tokens=False)
    
+    def tok_decode(self, tokens):
+        """Decode `tokens` back to text with the tokenizer."""
+        return self.tokenizer.decode(tokens)
+
    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
-        logits retuned from the model
+        logits returned from the model
        """
-        return self.gpt2(inps)[0][:, :, :50257]
+        # NOTE(review): the vocab axis is cut to a hard-coded 50257 entries even
+        # though __init__ records self.vocab_size and admits T5 tokenizers;
+        # confirm whether this slice should follow self.vocab_size instead
+        with torch.no_grad():
+            return self.gpt2(inps)[0][:, :, :50257]
    
-    def greedy_until(self, requests):
-        # TODO: implement fully general `until` that handles untils that are 
-        # multiple tokens or that span multiple tokens correctly
-        res = []
-
-        def _collate(x):
-            toks = self.tokenizer.encode(x[0])
-            return (len(toks), x[0])
-        
-        reord = utils.Reorderer(requests, _collate)
-
-        for context, until in tqdm(reord.get_reordered()):
-            if isinstance(until, str): until = [until]
-
-            context_enc = torch.tensor([self.tokenizer.encode(context)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
-
-            primary_until, = self.tokenizer.encode(until[0])
-
-            cont = self.gpt2.generate(
-                context_enc,
-                max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
-                eos_token_id=primary_until,
-                do_sample=False
-            )
-
-            s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
-
-            for term in until:
-                s = s.split(term)[0]
-            
-            # partial caching
-            self.cache_hook.add_partial("greedy_until", (context, until), s)
-            
-            res.append(s)
-        
-        return reord.get_original(res)
+    def _model_generate(self, context, max_length, eos_token_id):
+        """Generate from `context` with sampling disabled.
+
+        Forwards `max_length` and `eos_token_id` unchanged to
+        `self.gpt2.generate` and returns its result.
+        """
+        return self.gpt2.generate(
+            context,
+            max_length=max_length,
+            eos_token_id=eos_token_id,
+            do_sample=False
+        )
+
+
+# for backwards compatibility
+GPT2LM = HFLM
--- a/lm_eval/models/gpt3.py
+++ b/lm_eval/models/gpt3.py
 import os
 import numpy as np
 import transformers
-from lm_eval.base import LM
+from lm_eval.base import BaseLM
 from lm_eval import utils
 from tqdm import tqdm
 import time


 def get_result(response, ctxlen):
+    """Process results from OpenAI API response.
+
+    :param response: dict
+        OpenAI API Response
+    :param ctxlen: int
+        Length of context (so we can slice them away and only keep the predictions)
+    :return:
+        continuation_logprobs: np.array
+            Log probabilities of continuation tokens
+        is_greedy: bool
+            whether argmax matches given continuation exactly
+    """
    is_greedy = True
    logprobs = response["logprobs"]["token_logprobs"]
    continuation_logprobs = sum(logprobs[ctxlen:])
@@ -24,8 +36,11 @@ def get_result(response, ctxlen):


 def oa_completion(**kwargs):
-    import openai
+    """ Query OpenAI API for completion.

+    Retry with back-off until they respond
+    """
+    import openai
    backoff_time = 3
    while True:
        try:
@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
            backoff_time *= 1.5


-class GPT3LM(LM):
-
-    MAX_LENGTH = 2048
+class GPT3LM(BaseLM):
    REQ_CHUNK_SIZE = 20
-    MAX_GEN_TOKS = 256

    def __init__(self, engine, truncate=False):
        """
@@ -50,10 +62,12 @@ class GPT3LM(LM):
            Truncate input if too long (if False and input is too long, throw error)
        """
        super().__init__()
+
        import openai
        self.engine = engine
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')

+        self.vocab_size = self.tokenizer.vocab_size

        # to make the annoying "Using pad_token, but it is not set yet." error go away
        self.tokenizer.pad_token = "<|endoftext|>"
@@ -64,53 +78,36 @@ class GPT3LM(LM):
        # Read from environment variable OPENAI_API_SECRET_KEY
        openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]

-    @classmethod
-    def create_from_arg_string(cls, arg_string, additional_config={}):
-        args = utils.simple_parse_args_string(arg_string)
-        args2 = {k: v for k, v in additional_config.items() if v is not None}
-        return cls(**args, **args2)
-
-    def loglikelihood(self, requests):
-        new_reqs = []
-        for context, continuation in requests:
-            if context == "":
-                # end of text as context
-                context_enc = [50256]
-            else:
-                context_enc = self.tokenizer.encode(context)
-
-            continuation_enc = self.tokenizer.encode(continuation)
-
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-
-        return self._loglikelihood_tokens(new_reqs)
-
-    def loglikelihood_rolling(self, requests):
-        # TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
-
-        loglikelihoods = []
-        for string, in tqdm(requests):
-            encoded = self.tokenizer.encode_plus(string)["input_ids"]
-            rolling_token_windows = utils.get_rolling_token_windows(
-                token_list=encoded,
-                prefix_token=self.end_of_text_token_id,
-                max_seq_len=self.MAX_LENGTH,
-                context_len=1,
-            )
-            string_loglikelihoods = []
-            for input_tokens, pred_tokens in rolling_token_windows:
-                block_output = self.get_token_logprobs(
-                    input_tokens=input_tokens,
-                    pred_tokens=pred_tokens,
-                )
-                string_loglikelihoods.append(block_output["logprobs"])
-            string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
-            loglikelihoods.append(string_loglikelihoods)
-
-        return loglikelihoods
-
-    def _loglikelihood_tokens(self, requests):
-        import openai
+    @property
+    def eot_token_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
+        return 2048
+
+    @property
+    def max_gen_toks(self):
+        return 256
+
+    @property
+    def batch_size(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    @property
+    def device(self):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def tok_encode(self, string: str):
+        return self.tokenizer.encode(string, add_special_tokens=False)
+    
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
+
+    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
        res = []

        def _collate(x):
@@ -118,16 +115,18 @@ class GPT3LM(LM):
            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
            # we care about and so we need some kind of backup for when it isn't
            toks = x[1] + x[2]
-            return (-len(toks), tuple(toks))
+            return -len(toks), tuple(toks)
        
        reord = utils.Reorderer(requests, _collate)

-        for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
+        for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
            inps = []
            ctxlens = []
            for cache_key, context_enc, continuation_enc in chunk:
-                inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
-                ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
+                # max_length+1 because the API takes up to 2049 tokens, including the first context token
+                inp = (context_enc + continuation_enc)[-(self.max_length+1):]
+                # TODO: the logic is much simpler if we just look at the length of continuation tokens
+                ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))

                inps.append(inp)
                ctxlens.append(ctxlen)
@@ -151,35 +150,14 @@ class GPT3LM(LM):

        return reord.get_original(res)

-    def get_token_logprobs(self, input_tokens, pred_tokens):
-        pred_start = len(input_tokens) - len(pred_tokens) + 1
-        # We're going to stitch together the input_tokens and pred_tokens
-        # In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
-        assert input_tokens[pred_start:] == pred_tokens[:-1]
-        token_ids = input_tokens + [pred_tokens[-1]]
-        response = oa_completion(
-            engine=self.engine,
-            prompt=token_ids,
-            max_tokens=0,
-            temperature=0.0,
-            logprobs=0,
-            echo=True,
-        )
-        logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
-        positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
-        return {
-            "logprobs": logprobs,
-            "positions": positions,
-        }
-
    def greedy_until(self, requests):
-        if not requests: return []
-        import openai
+        if not requests:
+            return []
        res = []

        def _collate(x):
-            toks = self.tokenizer.encode(x[0])
-            return (len(toks), x[0])
+            toks = self.tok_encode(x[0])
+            return len(toks), x[0]
        
        reord = utils.Reorderer(requests, _collate)

@@ -193,34 +171,43 @@ class GPT3LM(LM):
                    lastuntil = x[1]
                ret.append(x)
            
-            if ret: yield ret, lastuntil
+            if ret:
+                yield ret, lastuntil

-        # todo: more intelligent batching for heterogenous `until`
+        # todo: more intelligent batching for heterogeneous `until`
        for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
            inps = []
            for context, _ in chunk:
-                context_enc = self.tokenizer.encode(context)
-                inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
+                context_enc = self.tok_encode(context)
+                inp = context_enc[-(self.max_length - self.max_gen_toks):]
                inps.append(inp)

            response = oa_completion(
                engine=self.engine,
                prompt=inps,
-                max_tokens=self.MAX_GEN_TOKS, 
+                max_tokens=self.max_gen_toks, 
                temperature=0.,
                logprobs=10,
-                stop=until
+                stop=until,
            )

-            for resp, (context, until) in zip(response.choices, chunk):
+            for resp, (context, until_) in zip(response.choices, chunk):
                s = resp['text']

-                for term in until:
+                for term in until_:
                    s = s.split(term)[0]

                # partial caching
-                self.cache_hook.add_partial("greedy_until", (context, until), s)
+                self.cache_hook.add_partial("greedy_until", (context, until_), s)
                
                res.append(s)
        
        return reord.get_original(res)
+
+    def _model_call(self, inps):
+        # Isn't used because we override _loglikelihood_tokens
+        raise NotImplementedError()
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        # Isn't used because we override greedy_until
+        raise NotImplementedError()
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -22,6 +22,7 @@ from . import lambada
 from . import race
 from . import piqa
 from . import prost
+from . import mc_taco
 from . import triviaqa
 from . import pubmedqa
 from . import sciq
@@ -42,6 +43,11 @@ from . import pile
 from . import wikitext
 from . import xquad
 from . import mlqa
+from . import lambada_multilingual
+from . import mutual
+from . import truthfulqa
+from . import blimp
+from . import asdiv

 ########################################
 # Translation tasks
@@ -99,12 +105,17 @@ TASK_REGISTRY = {
    "drop": drop.DROP,
    "lambada": lambada.LAMBADA,
    "lambada_cloze": lambada_cloze.LAMBADA_cloze,
+    
+    # multilingual lambada
+    **lambada_multilingual.construct_tasks(),
+
    "wikitext": wikitext.WikiText,
    # "cbt-cn": cbt.CBTCN, # disabled pending context length fix
    # "cbt-ne": cbt.CBTNE, # disabled pending context length fix

    "piqa": piqa.PiQA,
    "prost": prost.PROST,
+    "mc_taco": mc_taco.MCTACO,

    # Science related
    "pubmedqa" : pubmedqa.Pubmed_QA,
@@ -143,7 +154,9 @@ TASK_REGISTRY = {

    "race": race.RACE,
    # "naturalqs": naturalqs.NaturalQs, # not implemented yet
-    "headqa": headqa.HeadQA,
+    "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
+    "headqa_es": headqa.HeadQAEs,
+    "headqa_en": headqa.HeadQAEn,
    "mathqa": mathqa.MathQA,
    "webqs": webqs.WebQs,
    "wsc273": wsc273.WinogradSchemaChallenge273,
@@ -159,6 +172,13 @@ TASK_REGISTRY = {
    "ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
    "ethics_virtue": hendrycks_ethics.EthicsVirtue,

+     "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
+     "truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
+
+    # dialogue
+    "mutual": mutual.MuTual,
+    "mutual_plus": mutual.MuTualPlus,
+
    # math
    "math_algebra": hendrycks_math.MathAlgebra,
    "math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
@@ -167,6 +187,7 @@ TASK_REGISTRY = {
    "math_num_theory": hendrycks_math.MathNumberTheory,
    "math_prealgebra": hendrycks_math.MathPrealgebra,
    "math_precalc": hendrycks_math.MathPrecalculus,
+    "math_asdiv": asdiv.Asdiv,

    # arithmetic
    "arithmetic_2da": arithmetic.Arithmetic2DPlus,
@@ -220,6 +241,75 @@ TASK_REGISTRY = {
    "pile_ubuntu-irc": pile.PileUbuntuIrc,
    "pile_wikipedia": pile.PileWikipedia,
    "pile_youtubesubtitles": pile.PileYoutubeSubtitles,
+    
+    # BLiMP
+    "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
+    "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
+    "blimp_anaphor_number_agreement": blimp.BlimpAnaphorNumberAgreement,
+    "blimp_animate_subject_passive": blimp.BlimpAnimateSubjectPassive,
+    "blimp_animate_subject_trans": blimp.BlimpAnimateSubjectTrans,
+    "blimp_causative": blimp.BlimpCausative,
+    "blimp_complex_NP_island": blimp.BlimpComplex_NPIsland,
+    "blimp_coordinate_structure_constraint_complex_left_branch": blimp.BlimpCoordinateStructureConstraintComplexLeftBranch,
+    "blimp_coordinate_structure_constraint_object_extraction": blimp.BlimpCoordinateStructureConstraintObjectExtraction,
+    "blimp_determiner_noun_agreement_1": blimp.BlimpDeterminerNounAgreement_1,
+    "blimp_determiner_noun_agreement_2": blimp.BlimpDeterminerNounAgreement_2,
+    "blimp_determiner_noun_agreement_irregular_1": blimp.BlimpDeterminerNounAgreementIrregular_1,
+    "blimp_determiner_noun_agreement_irregular_2": blimp.BlimpDeterminerNounAgreementIrregular_2,
+    "blimp_determiner_noun_agreement_with_adj_2": blimp.BlimpDeterminerNounAgreementWithAdj_2,
+    "blimp_determiner_noun_agreement_with_adj_irregular_1": blimp.BlimpDeterminerNounAgreementWithAdjIrregular_1,
+    "blimp_determiner_noun_agreement_with_adj_irregular_2": blimp.BlimpDeterminerNounAgreementWithAdjIrregular_2,
+    "blimp_determiner_noun_agreement_with_adjective_1": blimp.BlimpDeterminerNounAgreementWithAdjective_1,
+    "blimp_distractor_agreement_relational_noun": blimp.BlimpDistractorAgreementRelationalNoun,
+    "blimp_distractor_agreement_relative_clause": blimp.BlimpDistractorAgreementRelativeClause,
+    "blimp_drop_argument": blimp.BlimpDropArgument,
+    "blimp_ellipsis_n_bar_1": blimp.BlimpEllipsisNBar_1,
+    "blimp_ellipsis_n_bar_2": blimp.BlimpEllipsisNBar_2,
+    "blimp_existential_there_object_raising": blimp.BlimpExistentialThereObjectRaising,
+    "blimp_existential_there_quantifiers_1": blimp.BlimpExistentialThereQuantifiers_1,
+    "blimp_existential_there_quantifiers_2": blimp.BlimpExistentialThereQuantifiers_2,
+    "blimp_existential_there_subject_raising": blimp.BlimpExistentialThereSubjectRaising,
+    "blimp_expletive_it_object_raising": blimp.BlimpExpletiveItObjectRaising,
+    "blimp_inchoative": blimp.BlimpInchoative,
+    "blimp_intransitive": blimp.BlimpIntransitive,
+    "blimp_irregular_past_participle_adjectives": blimp.BlimpIrregularPastParticipleAdjectives,
+    "blimp_irregular_past_participle_verbs": blimp.BlimpIrregularPastParticipleVerbs,
+    "blimp_irregular_plural_subject_verb_agreement_1": blimp.BlimpIrregularPluralSubjectVerbAgreement_1,
+    "blimp_irregular_plural_subject_verb_agreement_2": blimp.BlimpIrregularPluralSubjectVerbAgreement_2,
+    "blimp_left_branch_island_echo_question": blimp.BlimpLeftBranchIslandEchoQuestion,
+    "blimp_left_branch_island_simple_question": blimp.BlimpLeftBranchIslandSimpleQuestion,
+    "blimp_matrix_question_npi_licensor_present": blimp.BlimpMatrixQuestionNpiLicensorPresent,
+    "blimp_npi_present_1": blimp.BlimpNpiPresent_1,
+    "blimp_npi_present_2": blimp.BlimpNpiPresent_2,
+    "blimp_only_npi_licensor_present": blimp.BlimpOnlyNpiLicensorPresent,
+    "blimp_only_npi_scope": blimp.BlimpOnlyNpiScope,
+    "blimp_passive_1": blimp.BlimpPassive_1,
+    "blimp_passive_2": blimp.BlimpPassive_2,
+    "blimp_principle_A_c_command": blimp.BlimpPrinciple_ACCommand,
+    "blimp_principle_A_case_1": blimp.BlimpPrinciple_ACase_1,
+    "blimp_principle_A_case_2": blimp.BlimpPrinciple_ACase_2,
+    "blimp_principle_A_domain_1": blimp.BlimpPrinciple_ADomain_1,
+    "blimp_principle_A_domain_2": blimp.BlimpPrinciple_ADomain_2,
+    "blimp_principle_A_domain_3": blimp.BlimpPrinciple_ADomain_3,
+    "blimp_principle_A_reconstruction": blimp.BlimpPrinciple_AReconstruction,
+    "blimp_regular_plural_subject_verb_agreement_1": blimp.BlimpRegularPluralSubjectVerbAgreement_1,
+    "blimp_regular_plural_subject_verb_agreement_2": blimp.BlimpRegularPluralSubjectVerbAgreement_2,
+    "blimp_sentential_negation_npi_licensor_present": blimp.BlimpSententialNegationNpiLicensorPresent,
+    "blimp_sentential_negation_npi_scope": blimp.BlimpSententialNegationNpiScope,
+    "blimp_sentential_subject_island": blimp.BlimpSententialSubjectIsland,
+    "blimp_superlative_quantifiers_1": blimp.BlimpSuperlativeQuantifiers_1,
+    "blimp_superlative_quantifiers_2": blimp.BlimpSuperlativeQuantifiers_2,
+    "blimp_tough_vs_raising_1": blimp.BlimpToughVsRaising_1,
+    "blimp_tough_vs_raising_2": blimp.BlimpToughVsRaising_2,
+    "blimp_transitive": blimp.BlimpTransitive,
+    "blimp_wh_island": blimp.BlimpWhIsland,
+    "blimp_wh_questions_object_gap": blimp.BlimpWhQuestionsObjectGap,
+    "blimp_wh_questions_subject_gap": blimp.BlimpWhQuestionsSubjectGap,
+    "blimp_wh_questions_subject_gap_long_distance": blimp.BlimpWhQuestionsSubjectGapLongDistance,
+    "blimp_wh_vs_that_no_gap": blimp.BlimpWhVsThatNoGap,
+    "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
+    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
+    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
 }



--- a/lm_eval/tasks/asdiv.py
+++ b/lm_eval/tasks/asdiv.py
+"""
+ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
+https://arxiv.org/abs/2106.15772
+
+@misc{miao2021diverse,
+      title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
+      author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
+      year={2021},
+      eprint={2106.15772},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+"""
+from lm_eval.base import Task
+from pathlib import Path
+from best_download import download_file 
+import xml.etree.ElementTree as ET
+from lm_eval.base import rf
+from lm_eval.metrics import mean,perplexity
+import numpy as np
+from zipfile import ZipFile
+import os 
+
+# NOTE: each problem's formula is currently ignored when generating answers.
+
+# The task exposes a single (validation) split whose docs are parsed from the ASDiv XML file.
+class Asdiv(Task):
+    """ASDiv math word problems, scored by accuracy on the gold answer.
+
+    The dataset archive is fetched from GitHub at a pinned commit and the
+    problems are read from the ``ASDiv.xml`` file it contains. Only a
+    validation split is exposed, and the task asserts a zero-shot setting.
+    """
+    VERSION = 0
+    DATASET_PATH = Path("data/asdiv")
+
+    def download(self):
+        """Download and unpack the pinned ASDiv archive.
+
+        Does nothing when ``DATASET_PATH`` already exists.
+        """
+        if self.DATASET_PATH.exists():
+            return
+        # parents=True: the enclosing "data/" directory may not exist yet.
+        self.DATASET_PATH.mkdir(parents=True)
+        url = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
+        checksum = "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"
+        zip_path = self.DATASET_PATH / "55790e5270bb91ccfa5053194b25732534696b50.zip"
+        download_file(url, str(zip_path), checksum)
+        # Named ``archive`` rather than ``zip`` so the builtin is not shadowed.
+        with ZipFile(zip_path, "r") as archive:
+            archive.extractall(self.DATASET_PATH)
+        # The archive is no longer needed once its contents are extracted.
+        os.remove(zip_path)
+
+    def _convert_standard(self, problem):
+        """Turn one ``Problem`` XML element into a question/body/answer dict."""
+        #TODO: include solution-type and formula
+        out_doc = {
+            "question" : problem.find('Question').text,
+            "body" : problem.find('Body').text,
+            "answer": problem.find('Answer').text
+        }
+        return out_doc
+
+    def load_docs(self, textfilename, tfds=False):
+        """Yield one document dict per ``Problem`` element of the XML file.
+
+        ``tfds`` is currently unused.
+        """
+        root = ET.parse(textfilename).getroot()
+        for problem in root.iter('Problem'):
+            yield self._convert_standard(problem)
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
+    def training_docs(self):
+        raise NotImplementedError("This dataset has no training docs")
+
+    def test_docs(self):
+        raise NotImplementedError("This dataset has no test docs")
+
+    def validation_docs(self):
+        """Return a generator over every problem in the extracted ``ASDiv.xml``."""
+        data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50/dataset/ASDiv.xml"
+        return self.load_docs(data_xml_path)
+
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
+        """Build the context via the parent class, after asserting ``num_fewshot`` is 0."""
+        assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
+
+    def fewshot_description(self):
+        # TODO: add solution-type and formula
+        desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
+        return desc
+
+    def doc_to_text(self, doc):
+        """Format the prompt: problem body, then the question, then an open ``Answer:``."""
+        # TODO: add solution-type
+        return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
+
+    def doc_to_target(self, doc):
+        """Return the gold answer with a leading space."""
+        # TODO: add formula
+
+        # Keep only the text before the first " (" -- presumably this drops a
+        # parenthesised unit suffix; confirm against the dataset.
+        answer = doc['answer'].split(' (')[0]
+        return " " + answer
+
+    def construct_requests(self, doc, ctx):
+        """Request the loglikelihood of the gold target given ``ctx``."""
+        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
+        return ll, is_greedy
+
+    def process_results(self, doc, results):
+        """Score a document: ``acc`` is ``int(is_greedy)``; the loglikelihood is unused."""
+        ll, is_greedy = results
+
+        return {
+            'acc': int(is_greedy)
+        }
+
+    def aggregation(self):
+        return {
+            'acc': mean
+        }
+
+    def higher_is_better(self):
+        return {
+            'acc': True
+        }
+
--- a/lm_eval/tasks/blimp.py
+++ b/lm_eval/tasks/blimp.py
+"""
+BLiMP: A Benchmark of Linguistic Minimal Pairs for English
+https://arxiv.org/abs/1912.00582
+
+@article{warstadt2019blimp,
+  title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English},
+  author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei, and Wang, Sheng-Fu and Bowman, Samuel R},
+  journal={arXiv preprint arXiv:1912.00582},
+  year={2019}
+}
+"""
+
+from lm_eval.base import rf
+from lm_eval.metrics import mean
+from .common import HFTask
+
+
+class BlimpTask(HFTask):
+    """Common behaviour shared by every BLiMP minimal-pair task.
+
+    Each document carries a "sentence_good" and a "sentence_bad". The task
+    requests a loglikelihood for both and counts the document as correct
+    when the result for the good sentence compares greater.
+    """
+    VERSION = 0
+    DATASET_PATH = "blimp"
+
+    def download(self):
+        super().download()
+
+        # Only a "train" split is available from the HF dataset, while the
+        # harness looks for "validation". The training split is re-exposed
+        # under that name, on the assumption that the evaluated model was
+        # never actually trained on this data.
+
+        self.data["validation"] = self.data["train"]
+        del self.data["train"]
+
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
+        # Zero-shot and description-free only; the context is always empty.
+        assert num_fewshot == 0
+        assert not provide_description
+        return ""
+
+    def doc_to_text(self, doc):
+        # only the tests call this method
+        return ""
+
+    def doc_to_target(self, doc):
+        # only the tests call this method
+        return ""
+
+    def construct_requests(self, doc, ctx):
+        assert not ctx
+
+        # One loglikelihood request per sentence: good first, bad second.
+        # Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
+        return [
+            rf.loglikelihood("", doc[key])
+            for key in ("sentence_good", "sentence_bad")
+        ]
+
+    def process_results(self, doc, results):
+        good_result, bad_result = results
+
+        # the case counts as correct iff the good sentence scored higher than the bad one
+        if good_result > bad_result:
+            acc = 1.0
+        else:
+            acc = 0.0
+
+        return {"acc": acc}
+
+    def aggregation(self):
+        return {"acc": mean}
+
+    def higher_is_better(self):
+        return {"acc": True}
+
+
+class BlimpAdjunctIsland(BlimpTask):
+    DATASET_NAME = "adjunct_island"
+
+
+class BlimpAnaphorGenderAgreement(BlimpTask):
+    DATASET_NAME = "anaphor_gender_agreement"
+
+
+class BlimpAnaphorNumberAgreement(BlimpTask):
+    DATASET_NAME = "anaphor_number_agreement"
+
+
+class BlimpAnimateSubjectPassive(BlimpTask):
+    DATASET_NAME = "animate_subject_passive"
+
+
+class BlimpAnimateSubjectTrans(BlimpTask):
+    DATASET_NAME = "animate_subject_trans"
+
+
+class BlimpCausative(BlimpTask):
+    DATASET_NAME = "causative"
+
+
+class BlimpComplex_NPIsland(BlimpTask):
+    DATASET_NAME = "complex_NP_island"
+
+
+class BlimpCoordinateStructureConstraintComplexLeftBranch(BlimpTask):
+    DATASET_NAME = "coordinate_structure_constraint_complex_left_branch"
+
+
+class BlimpCoordinateStructureConstraintObjectExtraction(BlimpTask):
+    DATASET_NAME = "coordinate_structure_constraint_object_extraction"
+
+
+class BlimpDeterminerNounAgreement_1(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_1"
+
+
+class BlimpDeterminerNounAgreement_2(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_2"
+
+
+class BlimpDeterminerNounAgreementIrregular_1(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_irregular_1"
+
+
+class BlimpDeterminerNounAgreementIrregular_2(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_irregular_2"
+
+
+class BlimpDeterminerNounAgreementWithAdj_2(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_with_adj_2"
+
+
+class BlimpDeterminerNounAgreementWithAdjIrregular_1(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_with_adj_irregular_1"
+
+
+class BlimpDeterminerNounAgreementWithAdjIrregular_2(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_with_adj_irregular_2"
+
+
+class BlimpDeterminerNounAgreementWithAdjective_1(BlimpTask):
+    DATASET_NAME = "determiner_noun_agreement_with_adjective_1"
+
+
+class BlimpDistractorAgreementRelationalNoun(BlimpTask):
+    DATASET_NAME = "distractor_agreement_relational_noun"
+
+
+class BlimpDistractorAgreementRelativeClause(BlimpTask):
+    DATASET_NAME = "distractor_agreement_relative_clause"
+
+
+class BlimpDropArgument(BlimpTask):
+    DATASET_NAME = "drop_argument"
+
+
+class BlimpEllipsisNBar_1(BlimpTask):
+    DATASET_NAME = "ellipsis_n_bar_1"
+
+
+class BlimpEllipsisNBar_2(BlimpTask):
+    DATASET_NAME = "ellipsis_n_bar_2"
+
+
+class BlimpExistentialThereObjectRaising(BlimpTask):
+    DATASET_NAME = "existential_there_object_raising"
+
+
+class BlimpExistentialThereQuantifiers_1(BlimpTask):
+    DATASET_NAME = "existential_there_quantifiers_1"
+
+
+class BlimpExistentialThereQuantifiers_2(BlimpTask):
+    DATASET_NAME = "existential_there_quantifiers_2"
+
+
+class BlimpExistentialThereSubjectRaising(BlimpTask):
+    DATASET_NAME = "existential_there_subject_raising"
+
+
+class BlimpExpletiveItObjectRaising(BlimpTask):
+    DATASET_NAME = "expletive_it_object_raising"
+
+
+class BlimpInchoative(BlimpTask):
+    DATASET_NAME = "inchoative"
+
+
+class BlimpIntransitive(BlimpTask):
+    DATASET_NAME = "intransitive"
+
+
+class BlimpIrregularPastParticipleAdjectives(BlimpTask):
+    DATASET_NAME = "irregular_past_participle_adjectives"
+
+
+class BlimpIrregularPastParticipleVerbs(BlimpTask):
+    DATASET_NAME = "irregular_past_participle_verbs"
+
+
+class BlimpIrregularPluralSubjectVerbAgreement_1(BlimpTask):
+    DATASET_NAME = "irregular_plural_subject_verb_agreement_1"
+
+
+class BlimpIrregularPluralSubjectVerbAgreement_2(BlimpTask):
+    DATASET_NAME = "irregular_plural_subject_verb_agreement_2"
+
+
+class BlimpLeftBranchIslandEchoQuestion(BlimpTask):
+    DATASET_NAME = "left_branch_island_echo_question"
+
+
+class BlimpLeftBranchIslandSimpleQuestion(BlimpTask):
+    DATASET_NAME = "left_branch_island_simple_question"
+
+
+class BlimpMatrixQuestionNpiLicensorPresent(BlimpTask):
+    DATASET_NAME = "matrix_question_npi_licensor_present"
+
+
+class BlimpNpiPresent_1(BlimpTask):
+    DATASET_NAME = "npi_present_1"
+
+
+class BlimpNpiPresent_2(BlimpTask):
+    DATASET_NAME = "npi_present_2"
+
+
+class BlimpOnlyNpiLicensorPresent(BlimpTask):
+    DATASET_NAME = "only_npi_licensor_present"
+
+
+class BlimpOnlyNpiScope(BlimpTask):
+    DATASET_NAME = "only_npi_scope"
+
+
+class BlimpPassive_1(BlimpTask):
+    DATASET_NAME = "passive_1"
+
+
+class BlimpPassive_2(BlimpTask):
+    DATASET_NAME = "passive_2"
+
+
+class BlimpPrinciple_ACCommand(BlimpTask):
+    DATASET_NAME = "principle_A_c_command"
+
+
+class BlimpPrinciple_ACase_1(BlimpTask):
+    DATASET_NAME = "principle_A_case_1"
+
+
+class BlimpPrinciple_ACase_2(BlimpTask):
+    DATASET_NAME = "principle_A_case_2"
+
+
+class BlimpPrinciple_ADomain_1(BlimpTask):
+    DATASET_NAME = "principle_A_domain_1"
+
+
+class BlimpPrinciple_ADomain_2(BlimpTask):
+    DATASET_NAME = "principle_A_domain_2"
+
+
+class BlimpPrinciple_ADomain_3(BlimpTask):
+    DATASET_NAME = "principle_A_domain_3"
+
+
+class BlimpPrinciple_AReconstruction(BlimpTask):
+    DATASET_NAME = "principle_A_reconstruction"
+
+
+class BlimpRegularPluralSubjectVerbAgreement_1(BlimpTask):
+    DATASET_NAME = "regular_plural_subject_verb_agreement_1"
+
+
+class BlimpRegularPluralSubjectVerbAgreement_2(BlimpTask):
+    DATASET_NAME = "regular_plural_subject_verb_agreement_2"
+
+
+class BlimpSententialNegationNpiLicensorPresent(BlimpTask):
+    DATASET_NAME = "sentential_negation_npi_licensor_present"
+
+
+class BlimpSententialNegationNpiScope(BlimpTask):
+    DATASET_NAME = "sentential_negation_npi_scope"
+
+
+class BlimpSententialSubjectIsland(BlimpTask):
+    DATASET_NAME = "sentential_subject_island"
+
+
+class BlimpSuperlativeQuantifiers_1(BlimpTask):
+    DATASET_NAME = "superlative_quantifiers_1"
+
+
+class BlimpSuperlativeQuantifiers_2(BlimpTask):
+    DATASET_NAME = "superlative_quantifiers_2"
+
+
+class BlimpToughVsRaising_1(BlimpTask):
+    DATASET_NAME = "tough_vs_raising_1"
+
+
+class BlimpToughVsRaising_2(BlimpTask):
+    DATASET_NAME = "tough_vs_raising_2"
+
+
+class BlimpTransitive(BlimpTask):
+    DATASET_NAME = "transitive"
+
+
+class BlimpWhIsland(BlimpTask):
+    DATASET_NAME = "wh_island"
+
+
+class BlimpWhQuestionsObjectGap(BlimpTask):
+    DATASET_NAME = "wh_questions_object_gap"
+
+
+class BlimpWhQuestionsSubjectGap(BlimpTask):
+    DATASET_NAME = "wh_questions_subject_gap"
+
+
+class BlimpWhQuestionsSubjectGapLongDistance(BlimpTask):
+    DATASET_NAME = "wh_questions_subject_gap_long_distance"
+
+
+class BlimpWhVsThatNoGap(BlimpTask):
+    DATASET_NAME = "wh_vs_that_no_gap"
+
+
+class BlimpWhVsThatNoGapLongDistance(BlimpTask):
+    DATASET_NAME = "wh_vs_that_no_gap_long_distance"
+
+
+class BlimpWhVsThatWithGap(BlimpTask):
+    DATASET_NAME = "wh_vs_that_with_gap"
+
+
+class BlimpWhVsThatWithGapLongDistance(BlimpTask):
+    DATASET_NAME = "wh_vs_that_with_gap_long_distance"
--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
@@ -14,15 +14,16 @@ Acknowledgement: This implementation is based on the official evaluation for `DR
 https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
 """

+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

 class DROP(Task):
-    VERSION = 0
+    VERSION = 1
    DATASET_PATH = Path("data/drop")

    def download(self):
        if self.DATASET_PATH.exists():
            return
-        Path.mkdir(self.DATASET_PATH)
+        Path.mkdir(self.DATASET_PATH, parents=True)
        url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
        checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
        zip_path = self.DATASET_PATH / "drop_dataset.zip"
@@ -50,19 +51,34 @@ class DROP(Task):
                    "id": qa["query_id"],
                    "passage": doc["passage"],
                    "question": qa["question"],
-                    "answers": self.get_answers(qa["answer"]),
+                    "answers": self.get_answers(qa),
                }

    @classmethod
-    def get_answers(cls, answers):
-        # NOTE: We wrap every non-`list` answer into a list for uniformity.
-        if answers["number"] != "":
-            return [str(answers["number"])]
-        if answers["spans"] != []:
-            return answers["spans"]
-        return [" ".join([answers["date"]["day"],
-                          answers["date"]["month"],
-                          answers["date"]["year"]]).strip()]
+    def get_answers(cls, qa):
+        answers = []
+        answers_set = set()
+
+        candidates = [qa["answer"]] + qa.get("validated_answers", [])
+        for candidate in candidates:
+            answer = cls.parse_answer(candidate)
+            if answer in answers_set:
+                continue
+            answers_set.add(answer)
+            answers.append(answer)
+
+        return answers
+
+    @classmethod
+    def parse_answer(cls, answer):
+        # NOTE: Everything is returned as a tuple for uniformity and hashability.
+        if answer["number"] != "":
+            return (str(answer["number"]),)
+        if answer["spans"] != []:
+            return tuple(answer["spans"])
+        return (" ".join([answer["date"]["day"],
+                          answer["date"]["month"],
+                          answer["date"]["year"]]).strip(),)

    def training_docs(self):
        docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json"))
@@ -76,7 +92,7 @@ class DROP(Task):
        return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
-        return " " + ", ".join(doc["answers"])
+        return " " + ", ".join(doc["answers"][0])

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
@@ -89,9 +105,7 @@ class DROP(Task):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
-        conts = []
-        for _ in doc["answers"]:
-            conts.append(rf.greedy_until(ctx, ["."]))
+        conts = [rf.greedy_until(ctx, ["."])]
        return conts

    def process_results(self, doc, results):
@@ -105,66 +119,96 @@ class DROP(Task):
            The results of the requests created in construct_requests.
        """
        preds, golds = results, doc["answers"]
-        exact_match, f1_score = self.get_metrics(preds, golds)
+        max_em = 0
+        max_f1 = 0
+        for gold_answer in golds:
+            exact_match, f1_score = self.get_metrics(preds, gold_answer)
+            if gold_answer[0].strip():
+                max_em = max(max_em, exact_match)
+                max_f1 = max(max_f1, f1_score)
        return {
-            "em": exact_match,
-            "f1": f1_score
+            "em": max_em,
+            "f1": max_f1
        }

-    def get_metrics(self, preds, golds):
-        exact_match = self._exact_match(preds, golds)
-        f1_score = self._f1_score(preds, golds)
-        return exact_match, f1_score
-
-    def _exact_match(self, preds, golds):
-        """ Returns the exact match of normalized gold answers and predictions. """
-        normalized_preds = [self._normalize(pred) for pred in preds]
-        normalized_golds = [self._normalize(gold) for gold in golds]
-        is_equal_sets = set(normalized_preds) == set(normalized_golds)
-        is_equal_length = len(normalized_preds) == len(normalized_golds)
-        return int(is_equal_sets and is_equal_length)
-
-    def _f1_score(self, preds, golds):
-        """Returns the average F1-score over normalized gold answers and predictions.
-        From Section 5 of Dua et al. "DROP:...":
-        "When an answer has multiple spans, we first perform a one-to-one
-        alignment greedily based on bag-of-word overlap on the set of spans
-        and then compute average F1 over each span."
+    def get_metrics(self, predicted, gold):
+        """
+        Takes a predicted answer and a gold answer (that are both either a string or a list of
+        strings), and returns exact match and the DROP F1 metric for the prediction.  If you are
+        writing a script for evaluating objects in memory (say, the output of predictions during
+        validation, or while training), this is the function you want to call, after using
+        :func:`answer_json_to_strings` when reading the gold answer from the released data file.
        """
-        pred_bags = self._answer_to_bags(preds)
-        gold_bags = self._answer_to_bags(golds)
-        f1_per_bag = self._align_bags(pred_bags, gold_bags)
-        return np.mean(f1_per_bag)
-
-    def _answer_to_bags(self, answers):
-        return [set(self._normalize(answer).split()) for answer in answers]
-
-    def _align_bags(self, pred_bags, gold_bags):
-        """ Returns the max metric value over all the answers. """
-        scores = np.zeros([len(gold_bags), len(pred_bags)])
-        for gold_index, gold_bag in enumerate(gold_bags):
-            for pred_index, pred_bag in enumerate(pred_bags):
-                if self._is_number_match(pred_bag, gold_bag):
-                    scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag)
+        predicted_bags = self._answer_to_bags(predicted)
+        gold_bags = self._answer_to_bags(gold)
+
+        if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
+            exact_match = 1.0
+        else:
+            exact_match = 0.0
+
+        f1_per_bag = self._align_bags(predicted_bags[1], gold_bags[1])
+        f1 = np.mean(f1_per_bag)
+        f1 = round(f1, 2)
+        return exact_match, f1
+
+    def _answer_to_bags(self, answer):
+        if isinstance(answer, (list, tuple)):
+            raw_spans = answer
+        else:
+            raw_spans = [answer]
+        normalized_spans = []
+        token_bags = []
+        for raw_span in raw_spans:
+            normalized_span = self._normalize(raw_span)
+            normalized_spans.append(normalized_span)
+            token_bags.append(set(normalized_span.split()))
+        return normalized_spans, token_bags
+
+    def _align_bags(self, predicted, gold):
+        """
+        Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
+        between them and gets maximum metric values over all the answers.
+        """
+        scores = np.zeros([len(gold), len(predicted)])
+        for gold_index, gold_item in enumerate(gold):
+            for pred_index, pred_item in enumerate(predicted):
+                if self._match_numbers_if_present(gold_item, pred_item):
+                    scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
        row_ind, col_ind = linear_sum_assignment(-scores)
-        max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
+
+        max_scores = np.zeros([max(len(gold), len(predicted))])
        for row, column in zip(row_ind, col_ind):
            max_scores[row] = max(max_scores[row], scores[row, column])
        return max_scores

-    def _bag_f1(self, pred_bag, gold_bag):
-        intersection = len(gold_bag.intersection(pred_bag))
-        if intersection == 0:
-            return 0.0
-        precision = intersection / float(len(pred_bag)) if pred_bag else 1.0
-        recall = intersection / float(len(gold_bag)) if gold_bag else 1.0
-        f1 = (2 * precision * recall) / (precision + recall)
+    def _compute_f1(self, predicted_bag, gold_bag):
+        intersection = len(gold_bag.intersection(predicted_bag))
+        if not predicted_bag:
+            precision = 1.0
+        else:
+            precision = intersection / float(len(predicted_bag))
+        if not gold_bag:
+            recall = 1.0
+        else:
+            recall = intersection / float(len(gold_bag))
+        f1 = (
+            (2 * precision * recall) / (precision + recall)
+            if not (precision == 0.0 and recall == 0.0)
+            else 0.0
+        )
        return f1

-    def _is_number_match(self, pred_bag, gold_bag):
-        pred_numbers = set([word for word in pred_bag if self._is_number(word)])
-        gold_numbers = set([word for word in gold_bag if self._is_number(word)])
-        if (not gold_numbers) or gold_numbers.intersection(pred_numbers):
+    def _match_numbers_if_present(self, gold_bag, predicted_bag):
+        gold_numbers = set()
+        predicted_numbers = set()
+        for word in gold_bag:
+            if self._is_number(word):
+                gold_numbers.add(word)
+        for word in predicted_bag:
+            if self._is_number(word):
+                predicted_numbers.add(word)
+        if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
            return True
        return False

@@ -175,30 +219,29 @@ class DROP(Task):
        except ValueError:
            return False

-    def _normalize(self, answer):
-        def remove_articles(text):
-            regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
-            return re.sub(regex, " ", text)
+    def _remove_articles(self, text):
+        return _ARTICLES.sub(" ", text)

-        def white_space_fix(text):
-            return " ".join(text.split())
+    def _white_space_fix(self, text):
+        return " ".join(text.split())

-        def remove_punc(text):
-            exclude = set(string.punctuation)
-            if not self._is_number(text):
-                return "".join(ch for ch in text if ch not in exclude)
-            else:
-                return text
+    def _remove_punc(self, text):
+        exclude = set(string.punctuation)
+        if not self._is_number(text):
+            return "".join(ch for ch in text if ch not in exclude)
+        else:
+            return text

-        def fix_number(text):
-            return str(float(text)) if self._is_number(text) else text
+    def _fix_number(self, text):
+        return str(float(text)) if self._is_number(text) else text

-        def tokenize(text):
-            return re.split(" |-", text)
+    def _tokenize(self, text):
+        return re.split(" |-", text)

+    def _normalize(self, answer):
        tokens = [
-            white_space_fix(remove_articles(fix_number(remove_punc(token.lower()))))
-            for token in tokenize(answer)
+            self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
+            for token in self._tokenize(answer)
        ]
        tokens = [token for token in tokens if token.strip()]
        normalized = " ".join(tokens).strip()

--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -227,7 +227,7 @@ class QNLI(HFTask):


 class WNLI(HFTask):
-    VERSION = 0
+    VERSION = 1
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"

@@ -241,26 +241,25 @@ class WNLI(HFTask):
        return False

    def doc_to_text(self, doc):
-        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
+        return "{}\nQuestion: {} True or False?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        # True = entailment
-        # False = contradiction
-        # Neither = neutral
-        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
+        # False = not_entailment
+        return " {}".format({0: "False", 1: "True"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
-        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
-        return ll_true, ll_neither, ll_false
+        return ll_true, ll_false

    def process_results(self, doc, results):
+        ll_true, ll_false = results
+        pred = ll_true > ll_false
        gold = doc["label"]
-        pred = np.argmax(results)
        return {
            "acc": pred == gold
        }

--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
@@ -2,10 +2,9 @@ from . common import HFTask
 from lm_eval.base import MultipleChoiceTask


-class HeadQA(HFTask, MultipleChoiceTask):
+class HeadQABase(HFTask, MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "head_qa"
-    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -31,3 +30,20 @@ class HeadQA(HFTask, MultipleChoiceTask):

    def doc_to_text(self, doc):
        return doc["query"]
+
+class HeadQAEn(HeadQABase):
+    DATASET_NAME = "en"
+
+class HeadQAEs(HeadQABase):
+    DATASET_NAME = "es"
+
+# for backwards compatibility
+class HeadQAEsDeprecated(HeadQABase):
+    """Deprecated alias of the Spanish split, kept for backwards compatibility."""
+    DATASET_NAME = "es"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Warn on instantiation only: a `print` placed directly in the class body
+        # runs whenever this module is imported, even if the task is never used.
+        print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
@@ -3,6 +3,7 @@ from lm_eval.base import Task, rf
 from lm_eval.metrics import mean, perplexity
 from lm_eval.utils import sh
 from best_download import download_file
+import os


 class LAMBADA(Task):
@@ -10,11 +11,12 @@ class LAMBADA(Task):
    def download(self):
        sh("mkdir -p data/lambada")
        try:
-            download_file(
-                "http://eaidata.bmk.sh/data/lambada_test.jsonl", 
-                "data/lambada/lambada_test.jsonl", 
-                "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
-            )
+            if not os.path.exists("data/lambada/lambada_test.jsonl"):
+                download_file(
+                    "http://eaidata.bmk.sh/data/lambada_test.jsonl", 
+                    "data/lambada/lambada_test.jsonl", 
+                    "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
+                )
        except:
            # fallback - for some reason best_download doesnt work all the time here
            sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")

--- a/lm_eval/tasks/lambada_multilingual.py
+++ b/lm_eval/tasks/lambada_multilingual.py
+from . import lambada
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean, perplexity
+from lm_eval.utils import sh
+from best_download import download_file
+import json
+from functools import partial
+import os 
+
+# This task is LAMBADA, but machine-translated to the other languages.
+
+# Language codes; `construct_tasks` pairs them positionally with LANG_CLASSES.
+LANGS = ["en", "fr", "de", "it", "es"]
+# Checksum of each language's `lambada_test_{lang}.jsonl`, keyed by language code.
+# Passed to `download_file` and re-checked with `sha256sum` in the wget fallback.
+CHECKSUMS = {"en": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226", 
+             "fr": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362", 
+             "de": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e", 
+             "it": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850", 
+             "es": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"
+            }
+
+class MultilingualLAMBADA(lambada.LAMBADA):
+    """LAMBADA evaluated on the `lambada_test_{lang}.jsonl` file of one language.
+
+    :param lang:
+        Language code selecting the data file. It must be a key of `CHECKSUMS`
+        for `download` to succeed.
+    """
+    VERSION = 0
+
+    def __init__(self, lang=None):
+        # Assigned before the base constructor runs, keeping the original order
+        # (presumably the base constructor calls `download()` - TODO confirm).
+        self.LANG = lang
+        super().__init__()
+
+    def _data_file(self):
+        """Return the local path of this language's test file."""
+        return f"data/lambada/lambada_test_{self.LANG}.jsonl"
+
+    def download(self):
+        """Fetch this language's test file unless it already exists locally.
+
+        Tries `download_file` first and falls back to `wget` followed by a
+        `sha256sum` check when that raises.
+
+        :raises KeyError:
+            If the file is missing and `self.LANG` has no entry in `CHECKSUMS`.
+        """
+        sh("mkdir -p data/lambada")
+        path = self._data_file()
+        if os.path.exists(path):
+            return
+        url = f"http://eaidata.bmk.sh/data/lambada_test_{self.LANG}.jsonl"
+        # Look the checksum up front so an unknown language fails immediately
+        # instead of after a pointless fallback download.
+        checksum = CHECKSUMS[self.LANG]
+        try:
+            download_file(url, path, checksum)
+        except Exception:
+            # `Exception` rather than a bare `except`, so Ctrl-C / SystemExit
+            # abort the run instead of silently triggering the fallback.
+            # fallback - for some reason best_download doesn't work all the time here
+            sh(f"wget {url} -O {path}")
+            sh(f'echo "{checksum}  {path}" | sha256sum --check')
+
+    def validation_docs(self):
+        """Yield one parsed JSON document per line of the test file."""
+        # Explicit UTF-8 so decoding does not depend on the platform's default
+        # encoding.
+        with open(self._data_file(), encoding="utf-8") as fh:
+            for line in fh:
+                yield json.loads(line)
+
+class MultilingualLAMBADAEN(MultilingualLAMBADA):
+    """Multilingual LAMBADA task bound to the language code `en`."""
+
+    def __init__(self):
+        super().__init__(lang="en")
+
+class MultilingualLAMBADAFR(MultilingualLAMBADA):
+    """Multilingual LAMBADA task bound to the language code `fr`."""
+
+    def __init__(self):
+        super().__init__(lang="fr")
+
+class MultilingualLAMBADADE(MultilingualLAMBADA):
+    """Multilingual LAMBADA task bound to the language code `de`."""
+
+    def __init__(self):
+        super().__init__(lang="de")
+
+class MultilingualLAMBADAIT(MultilingualLAMBADA):
+    """Multilingual LAMBADA task bound to the language code `it`."""
+
+    def __init__(self):
+        super().__init__(lang="it")
+
+class MultilingualLAMBADAES(MultilingualLAMBADA):
+    """Multilingual LAMBADA task bound to the language code `es`."""
+
+    def __init__(self):
+        super().__init__(lang="es")
+
+# Task classes listed in the same order as LANGS; `construct_tasks` zips the two
+# lists, so an entry added to one must be added at the same position in the other.
+LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR, MultilingualLAMBADADE, MultilingualLAMBADAIT, MultilingualLAMBADAES]
+
+def construct_tasks():
+    """Map each task name `lambada_mt_{lang}` to the task class for that language.
+
+    :return:
+        dict built by pairing `LANGS` with `LANG_CLASSES` position by position.
+    """
+    return {
+        f"lambada_mt_{code}": task_cls
+        for code, task_cls in zip(LANGS, LANG_CLASSES)
+    }
+
--- a/lm_eval/tasks/logiqa.py
+++ b/lm_eval/tasks/logiqa.py
@@ -10,7 +10,7 @@ class LogiQA(MultipleChoiceTask):
    def download(self):
        if self.DATASET_PATH.exists():
            return
-        Path.mkdir(self.DATASET_PATH)
+        Path.mkdir(self.DATASET_PATH, parents=True)
        base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
        splits = [
            {"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},

--- a/lm_eval/tasks/mc_taco.py
+++ b/lm_eval/tasks/mc_taco.py
+"""
+“Going on a vacation” takes longer than “Going for a walk”:
+A Study of Temporal Commonsense Understanding
+https://arxiv.org/pdf/1909.03065.pdf
+
+WARNING: Running this task with a `--limit` arg will give misleading results! The 
+corresponding dataset is structured such that each multiple-choice-question gathered
+by the authors is split into question-option pairs, where each such pair gets 
+siloed into an individual document for plausibility testing. Because the harness
+shuffles these documents, setting `--limit` will likely "cut off" certain candidate
+answers. This is a problem because the task's metrics require an exhaustive evaluation 
+of a question's options. See section 4 of the paper for details.
+
+@inproceedings{ZKNR19,
+    author = {Ben Zhou, Daniel Khashabi, Qiang Ning and Dan Roth},
+    title = {“Going on a vacation” takes longer than “Going for a walk”: A Study of Temporal Commonsense Understanding },
+    booktitle = {EMNLP},
+    year = {2019},
+}
+"""
+
+import numpy as np
+from lm_eval.base import rf
+from collections import defaultdict
+from . common import HFTask
+
+
+class MCTACO(HFTask):
+    """MC-TACO task: judge whether one candidate answer per document is plausible.
+
+    Each document is scored by comparing the log-likelihoods of " yes" and " no";
+    the per-document results are grouped by question in `f1` and `exact_match`.
+    """
+    VERSION = 0
+    DATASET_PATH = "mc_taco"
+    DATASET_NAME = None
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return 'Determine whether the candidate answer is plausible ("yes") or not ("no")'
+
+    def doc_to_text(self, doc):
+        """Render the sentence, question and candidate answer as the prompt."""
+        prompt_lines = (
+            f"{doc['sentence']}",
+            f"Question: {doc['question']}",
+            f"Answer: {doc['answer']}",
+            "Plausible:",
+        )
+        return "\n".join(prompt_lines)
+
+    def doc_to_target(self, doc):
+        """Return " yes" or " no" according to the document's label."""
+        label_word = ("no", "yes")[doc["label"]]
+        return " " + label_word
+
+    def construct_requests(self, doc, ctx):
+        """Build the two log-likelihood requests sent to the LM for one document.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context.
+        :return:
+            The requests for the continuations " no" and " yes", in that order.
+        """
+        ll_no, _ = rf.loglikelihood(ctx, " no")
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        return ll_no, ll_yes
+
+    def process_results(self, doc, results):
+        """Turn the LM results for one document into per-metric items.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        :return:
+            dict giving both "em" and "f1" the same `(gold, pred, question_id)` tuple.
+        """
+        ll_no, ll_yes = results
+        item = (doc["label"], int(ll_yes > ll_no), self._question2id(doc))
+        return {"em": item, "f1": item}
+
+    def _question2id(self, doc):
+        """ Returns an identifier for the question in the given document. """
+        return " ".join((doc["sentence"], doc["question"]))
+
+    def aggregation(self):
+        return dict(f1=f1, em=exact_match)
+
+    def higher_is_better(self):
+        return dict.fromkeys(("f1", "em"), True)
+
+
+def exact_match(items):
+    """
+    Fraction of questions for which every candidate answer was classified correctly.
+    See section 4 "Evaluation Metrics" in the paper.
+
+    :param items:
+        Sequence of `(gold, pred, question_id)` tuples, one per candidate answer.
+    """
+    columns = list(zip(*items))
+    golds, preds, question_ids = columns[0], columns[1], columns[2]
+    # Per question: stays True only while every candidate seen so far is correct.
+    all_correct = {}
+    for gold, pred, question_id in zip(golds, preds, question_ids):
+        so_far = all_correct.get(question_id, True)
+        all_correct[question_id] = bool(so_far and pred == gold)
+    return np.mean([int(flag) for flag in all_correct.values()])
+
+
+def f1(items):
+    """Average per-question F1 over the positive ("yes") class.
+
+    See section 4 "Evaluation Metrics" in the paper about the F1 metric used.
+
+    :param items:
+        Iterable of `(gold, pred, question_id)` tuples, one per candidate answer.
+    :return:
+        Mean of the per-question F1 scores, where a question without any true
+        positive contributes 0.
+    """
+    # Group the positive ("yes" = 1) golds and predictions by question.
+    golds_by_question, preds_by_question = defaultdict(list), defaultdict(list)
+    for gold, pred, question in items:
+        golds_by_question[question].append(gold)
+        preds_by_question[question].append(pred)
+    scores = []
+    for question, golds in golds_by_question.items():
+        preds = preds_by_question[question]
+        gold_pos, pred_pos = sum(golds), sum(preds)
+        true_pos = sum(1 for g, p in zip(golds, preds) if g and p)
+        # Precision / recall default to 1.0 when nothing was predicted / is gold.
+        precision = true_pos / pred_pos if pred_pos > 0 else 1.0
+        recall = true_pos / gold_pos if gold_pos > 0 else 1.0
+        if precision + recall > 0.0:
+            scores.append(2. * (precision * recall) / (precision + recall))
+        else:
+            # No true positives: the question scores 0 and must still count
+            # towards the average. Skipping it inflates the mean and yields NaN
+            # when every question scores 0.
+            scores.append(0.0)
+    return np.mean(scores)
--- a/lm_eval/tasks/mutual.py
+++ b/lm_eval/tasks/mutual.py
+"""
+MuTual: A Dataset for Multi-Turn Dialogue Reasoning
+https://www.aclweb.org/anthology/2020.acl-main.130/
+
+@inproceedings{mutual,
+    title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
+    author = "Cui, Leyang  and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
+    booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
+    year = "2020",
+    publisher = "Association for Computational Linguistics",
+}
+"""
+import json
+import zipfile
+import shutil
+import numpy as np
+from pathlib import Path
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
+from best_download import download_file
+
+
+class MuTualBase(Task):
+    """Shared implementation of the MuTual dialogue tasks.
+
+    Subclasses set `DATASET_NAME` to the subdirectory of `BASE_PATH` that holds
+    their `train` and `dev` folders. Every option of a document is scored by
+    log-likelihood and the gold option's rank yields r@1, r@2 and MRR.
+    """
+    VERSION = 1
+    BASE_PATH = Path("data/mutual")
+    DATASET_NAME = None
+    CHOICES = ['A', 'B', 'C', 'D']
+
+    def __init__(self):
+        super().__init__()
+
+    def download(self):
+        """Download and unpack the MuTual data into `BASE_PATH`.
+
+        Does nothing when `BASE_PATH` already exists.
+        """
+        if self.BASE_PATH.exists():
+            return
+        # Create only the parent directory. `BASE_PATH` itself is produced by the
+        # rename below, so a failed download cannot leave behind an empty
+        # directory that makes later runs skip the download, and the rename never
+        # targets an existing directory (which fails on Windows).
+        self.BASE_PATH.parent.mkdir(parents=True, exist_ok=True)
+        master_zip = Path("data/master.zip")
+        download_file(
+            "https://github.com/Nealcly/MuTual/archive/master.zip",
+            str(master_zip),
+            "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
+        with zipfile.ZipFile(master_zip, 'r') as archive:
+            archive.extractall("data")
+        Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
+        # Remove left over files and directories.
+        master_zip.unlink()
+        shutil.rmtree("data/MuTual-master")
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
+    def _load_docs(self, path):
+        """Yield the parsed JSON content of every `.txt` file in `path`, in sorted order."""
+        for doc_path in sorted(path.iterdir()):
+            if doc_path.suffix != ".txt":
+                continue
+            with open(doc_path, 'r', encoding='utf-8') as f:
+                yield json.load(f)
+
+    def training_docs(self):
+        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
+
+    def validation_docs(self):
+        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
+
+    def test_docs(self):
+        # No test documents are served (`has_test_docs` returns False).
+        return NotImplemented
+
+    def fewshot_description(self):
+        # TODO: figure out fewshot description
+        return ""
+
+    def doc_to_text(self, doc):
+        """Return the detokenized dialogue context (`doc["article"]`)."""
+        return self.detokenize(doc["article"])
+
+    def doc_to_target(self, doc):
+        """Return the detokenized gold option, prefixed with a space."""
+        return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])
+
+    def construct_requests(self, doc, ctx):
+        """Build one log-likelihood request per option, in option order.
+
+        :param doc:
+            The document; its "options" list supplies the continuations.
+        :param ctx: str
+            The context string each option is scored against.
+        """
+        return [
+            rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0]
+            for option in doc["options"]
+        ]
+
+    def detokenize(self, text):
+        """Undo tokenizer spacing around quotes, contractions, newlines and punctuation."""
+        text = text.replace(" '", "'")
+        text = text.replace(" \n", "\n")
+        text = text.replace("\n ", "\n")
+        text = text.replace(" n't", "n't")
+        text = text.replace("`` ", '"')
+        text = text.replace("''", '"')
+        # punctuation
+        text = text.replace(" :", ":")
+        text = text.replace(" ;", ";")
+        text = text.replace(" !", "!")
+        text = text.replace(" ?", "?")
+        text = text.replace(" ,", ",")
+        text = text.replace(" .", ".")
+        return text
+
+    def process_results(self, doc, results):
+        """Score one document from the per-option results.
+
+        :param doc:
+            The document; `doc["answers"]` is the gold letter from `CHOICES`.
+        :param results:
+            The results of the requests created in construct_requests, one per option.
+        :return:
+            dict with "r@1", "r@2" and "mrr" for this document.
+        """
+        gold = self.CHOICES.index(doc["answers"])
+        r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
+        ranks = sorted(results, reverse=True)
+        # Position of the gold option's score in the descending ranking.
+        # NOTE(review): with exactly tied scores this is the first position
+        # holding that score, so a gold option tied for the top but not picked by
+        # argmax gets mrr 1.0 while r@1 and r@2 are both false.
+        gold_rank = ranks.index(results[gold])
+        r4_2 = (gold_rank == 1) + r4_1
+        mrr = 1. / (gold_rank + 1)  # `+ 1` for index offset
+        return {
+            "r@1": r4_1,
+            "r@2": r4_2,
+            "mrr": mrr
+        }
+
+    def aggregation(self):
+        return {
+            "r@1": mean,
+            "r@2": mean,
+            "mrr": mean
+        }
+
+    def higher_is_better(self):
+        return {
+            "r@1": True,
+            "r@2": True,
+            "mrr": True
+        }
+
+
+class MuTual(MuTualBase):
+    """MuTual task reading its `train`/`dev` documents from `BASE_PATH / "mutual"`."""
+    DATASET_NAME = Path("mutual")
+
+
+class MuTualPlus(MuTualBase):
+    """MuTual task reading its `train`/`dev` documents from `BASE_PATH / "mutual_plus"`."""
+    DATASET_NAME = Path("mutual_plus")