Commit ff314d62 authored by Jonathan Tow's avatar Jonathan Tow


Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into evaluator-description-option
parents 564e0612 df5d7cf0
import abc
import random
from typing import Iterable
import numpy as np
import re
from lm_eval import tasks
import os
import json
import hashlib
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC):
def __init__(self):
self.cache_hook = CacheHook(None)
@abc.abstractmethod
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
......@@ -35,7 +43,7 @@ class LM(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
......@@ -78,7 +86,7 @@ class LM(abc.ABC):
pass
# TODO: Add an optional max length
@abc.abstractmethod
@abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
......@@ -97,18 +105,235 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
:param arg_string: str
Left up to individual model class to handle
"""
return cls()
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str): pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]): pass
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
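For illustration, a hedged, standalone sketch of the comma-separated `key=value` convention that `create_from_arg_string` builds on; `simple_parse_args_string_sketch` is a hypothetical stand-in for `utils.simple_parse_args_string`, which is not shown in this diff:

# Hypothetical re-implementation for illustration only; not part of this commit.
def simple_parse_args_string_sketch(arg_string):
    # "pretrained=gpt2,batch_size=4" -> {"pretrained": "gpt2", "batch_size": "4"}
    if not arg_string:
        return {}
    return dict(kv.split("=", 1) for kv in arg_string.split(","))

args = simple_parse_args_string_sketch("pretrained=gpt2,batch_size=4")
additional_config = {"device": "cpu", "subfolder": None}  # e.g. CLI flags
args2 = {k: v for k, v in additional_config.items() if v is not None}
# The model is then constructed as cls(**args, **args2):
print({**args, **args2})  # {'pretrained': 'gpt2', 'batch_size': '4', 'device': 'cpu'}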
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
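As a hedged usage sketch (not from this commit), requests to `loglikelihood` are `(context, continuation)` string pairs, and each result is a `(logprob, is_greedy)` tuple:

# `lm` would be any concrete BaseLM subclass, e.g. the HFLM defined below.
requests = [
    ("The quick brown fox jumps over the lazy", " dog"),
    ("", "The"),  # empty context is scored against the end-of-text token
]
# Each returned element is (log P(continuation | context), is_greedy), where
# is_greedy records whether the continuation equals the argmax decoding:
# for logprob, is_greedy in lm.loglikelihood(requests):
#     print(logprob, is_greedy)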
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
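To make the rolling-window idea concrete, here is a hedged, self-contained sketch of the splitting behavior (a simplified stand-in, not the actual `utils.get_rolling_token_windows` / `make_disjoint_window` implementation):

# Assumption: context_len=1 and a prefix token conditions the first window,
# so every token in the string is predicted exactly once.
def rolling_windows_sketch(tokens, prefix_token, max_seq_len):
    windows = []
    pred_start = 0
    while pred_start < len(tokens):
        pred_end = min(pred_start + max_seq_len, len(tokens))
        # everything before the predicted span (plus the prefix) may condition it,
        # clipped so the window fits in max_seq_len (+1 for the prefix token)
        ctx_start = max(0, pred_end - max_seq_len)
        context = ([prefix_token] + tokens)[ctx_start:pred_start + 1]
        windows.append((context, tokens[pred_start:pred_end]))
        pred_start = pred_end
    return windows

print(rolling_windows_sketch(list(range(10)), prefix_token=50256, max_seq_len=4))
# -> ([50256], [0..3]), ([3], [4..7]), ([5, 6, 7], [8, 9])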
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be overestimates rather than underestimates, which is more useful for planning
# - when going through the list, the first sequence in a batch is always the longest, so it sets the batch's
# padded context length. this simplifies the batching logic and, more importantly, makes automatic
# adaptive batching much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1],
dtype=torch.long
).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad length from seq to padding_length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length]
multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks \
in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
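The effect of `_collate`'s descending-length sort can be shown in isolation (standalone sketch, not using `utils.Reorderer`):

# The longest request leads each batch, so it fixes the padded length up front
# and any OOM surfaces on the very first batch rather than near the end.
reqs = [[1, 2], [3, 4, 5, 6], [7], [8, 9, 10]]
order = sorted(range(len(reqs)), key=lambda i: (-len(reqs[i]), tuple(reqs[i])))
batches = [order[i:i + 2] for i in range(0, len(order), 2)]  # batch_size = 2
print([[reqs[i] for i in b] for b in batches])
# [[[3, 4, 5, 6], [8, 9, 10]], [[1, 2], [7]]]
# After scoring, `order` is inverted to scatter results back to input order,
# which is what Reorderer.get_original does.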
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str):
until = [until]
primary_until, = self.tok_encode(until[0])
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
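A hedged usage sketch for `greedy_until`: each request pairs a context with one or more stop sequences, and the decoded text is trimmed at the earliest stop, mirroring the `s.split(term)[0]` loop above:

requests = [
    ("Q: What is 2+2?\nA:", ["\n"]),           # stop at the first newline
    ("Translate: bonjour ->", [".", "\n"]),    # several stop sequences
]
# Post-processing of a raw generation, as done above:
s = " 4\nQ: next question"
for term in ["\n"]:
    s = s.split(term)[0]
print(repr(s))  # ' 4'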
class Task(abc.ABC):
......@@ -129,17 +354,17 @@ class Task(abc.ABC):
"""Downloads the task dataset if necessary"""
pass
@abc.abstractmethod
@abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
@abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
@abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
......@@ -171,15 +396,15 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k)
@abc.abstractmethod
@abstractmethod
def doc_to_text(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def doc_to_target(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -193,7 +418,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -206,7 +431,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
......@@ -215,7 +440,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def higher_is_better(self):
"""
:returns: {str: bool}
......@@ -243,7 +468,9 @@ class Task(abc.ABC):
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs())
self._fewshot_docs = list(
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
......@@ -258,7 +485,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
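As an illustration of that final concatenation (hypothetical contents; the real pieces come from `doc_to_text`/`doc_to_target` and the sampled few-shot docs):

description = "Answer the question.\n\n"
labeled_examples = (
    "Q: Capital of France?\nA: Paris\n\n"
    "Q: Capital of Japan?\nA: Tokyo\n\n"
)
example = "Q: Capital of Italy?\nA:"  # the query doc, target left blank
print(description + labeled_examples + example)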
class MultipleChoiceTask(Task):
class MultipleChoiceTask(Task, abc.ABC):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
......@@ -330,10 +557,10 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
loglikelihood, = results
words = self.count_words(doc)
bytes = self.count_bytes(doc)
bytes_ = self.count_bytes(doc)
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes),
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (-loglikelihood, self.count_bytes(doc))
}
......@@ -344,25 +571,16 @@ class PerplexityTask(Task, abc.ABC):
"bits_per_byte": weighted_mean
}
def count_bytes(self, doc):
@classmethod
def count_bytes(cls, doc):
return len(doc.encode("utf-8"))
def count_words(self, doc):
@classmethod
def count_words(cls, doc):
""" Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc))
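A hedged sketch of what `process_results` emits for a single document: `(value, weight)` pairs that the aggregation functions later reduce across all documents:

doc = "hello world"
loglikelihood = -12.3                  # as returned by loglikelihood_rolling
words = len(doc.split())               # 2, default whitespace word boundary
bytes_ = len(doc.encode("utf-8"))      # 11
print({
    "word_perplexity": (loglikelihood, words),
    "byte_perplexity": (loglikelihood, bytes_),
    "bits_per_byte": (-loglikelihood, bytes_),
})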
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
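Usage sketch for `hash_args`; serializing through JSON makes the cache key deterministic across processes:

import hashlib
import json

def hash_args(attr, args):
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode('utf-8')).hexdigest()

key = hash_args("loglikelihood", ("The quick brown fox", " jumps"))
print(key[:16])  # same prefix every run for the same (attr, args)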
......@@ -385,9 +603,17 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db): os.makedirs(os.path.dirname(cache_db), exist_ok=True)
if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm
......@@ -411,13 +637,14 @@ class CachingLM:
res.append(None)
remaining_reqs.append(req)
# actually run the LM
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1
while res[resptr] is not None:
resptr += 1
res[resptr] = r
......@@ -433,32 +660,39 @@ class CachingLM:
return CacheHook(self)
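A hedged usage sketch for `CachingLM` (assumes a concrete model is available locally, so the calls are shown commented out):

# base_lm = lm_eval.models.get_model("gpt2").create_from_arg_string("device=cpu")
# lm = CachingLM(base_lm, "lm_cache/gpt2.db")
# r1 = lm.loglikelihood([("The quick brown fox", " jumps")])  # hits the model
# r2 = lm.loglikelihood([("The quick brown fox", " jumps")])  # served from sqlite
# assert r1 == r2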
REQUEST_RETURN_LENGTHS = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
class Request:
def __init__(self, type, args, index=None):
if type not in req_ret_lens.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(type))
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.type = type
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0
for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i)
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i)
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.type}{self.args}[{self.index}]\n"
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory:
def __getattr__(self, attr):
......
......@@ -8,7 +8,33 @@ import lm_eval.tasks
import lm_eval.base
import numpy as np
def simple_evaluate(model, model_args, task_names, description_path=None, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
:param model_args: str
String arguments for each model class, see LM.create_from_arg_string
:param task_names: list[str]
List of task names
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
......@@ -17,7 +43,9 @@ def simple_evaluate(model, model_args, task_names, description_path=None, num_fe
})
if not no_cache:
lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
lm = lm_eval.base.CachingLM(
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict(task_names)
description_dict = {}
......@@ -44,10 +72,34 @@ def simple_evaluate(model, model_args, task_names, description_path=None, num_fe
return results
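A typical invocation, as a hedged sketch (the task name is assumed to exist in the local task registry, and model weights must be available):

if __name__ == "__main__":
    from lm_eval import evaluator

    results = evaluator.simple_evaluate(
        model="gpt2",
        model_args="device=cpu",
        task_names=["lambada"],  # assumed present in lm_eval.tasks
        num_fewshot=0,
        limit=10,                # testing only, per the docstring warning
    )
    print(evaluator.make_table(results))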
def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap_iters=100000):
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
......@@ -55,23 +107,25 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
# we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad
# (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of requeste
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
#default to test doc, fall back to val doc if validation unavailable
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
elif task.has_validation_docs():
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
......@@ -90,25 +144,26 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
description=description
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id))
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. we could also implement some kind of autogrouping here; they should end up next to each other.
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
......@@ -135,7 +190,10 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
......@@ -146,6 +204,7 @@ def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap
def make_table(result_dict):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter
md_writer = MarkdownTableWriter()
......@@ -158,11 +217,11 @@ def make_table(result_dict):
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
for m, v in dic.items():
if m.endswith("_stderr"): continue
if m.endswith("_stderr"):
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else:
values.append([k, version, m, '%.4f' % v, '', ''])
......
import math
from collections import Iterable
from pprint import pprint
from collections.abc import Iterable
import numpy as np
import sacrebleu
......@@ -63,6 +62,7 @@ def acc_all(items):
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def acc_all_stderr(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
......@@ -98,6 +98,7 @@ def weighted_mean(items):
a, b = zip(*items)
return sum(a) / sum(b)
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
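A tiny worked example of these two helpers (standalone):

import math

def weighted_mean(items):
    a, b = zip(*items)
    return sum(a) / sum(b)

def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))

# Two documents as (total logprob, token count) pairs:
print(weighted_perplexity([(-6.0, 3), (-4.0, 2)]))  # exp(10/5) = e**2 ≈ 7.389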
......@@ -179,12 +180,13 @@ def _sacreformat(refs, preds):
return refs, preds
## stderr stuff
# stderr stuff
class _bootstrap_internal:
def __init__(self, f, n):
self.f = f
self.n = n
def __call__(self, v):
i, xs = v
rnd = random.Random()
......@@ -208,7 +210,9 @@ def bootstrap_stderr(f, xs, iters):
chunk_size = min(1000, iters)
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(pool.imap(_bootstrap_internal(f, chunk_size), [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
for bootstrap in tqdm(pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
# sample w replacement
res.extend(bootstrap)
......
......@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
......
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
from lm_eval.base import BaseLM
class GPT2LM(LM):
MAX_GEN_TOKS = 256
class HFLM(BaseLM):
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
......@@ -19,183 +13,71 @@ class GPT2LM(LM):
assert isinstance(batch_size, int)
if device:
self.device = torch.device(device)
self._device = torch.device(device)
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained, revision=revision +("/" + subfolder if subfolder is not None else "")).to(self.device)
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
# pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
self.VOCAB_SIZE = self.tokenizer.vocab_size
self.EOT_TOKEN_ID = self.tokenizer.eos_token_id
print(self.EOT_TOKEN_ID)
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparantly
self.max_length = self.gpt2.config.max_position_embeddings
self.vocab_size = self.tokenizer.vocab_size
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
self.batch_size = batch_size_per_gpu# * gpus
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context, add_special_tokens=False)
continuation_enc = self.tokenizer.encode(continuation, add_special_tokens=False)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string, add_special_tokens=False),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
# this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
contlens = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
# cont_toks 4 5 6 7 8 9
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad to length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
#last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
try:
return self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gpt2.config.max_position_embeddings
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
......@@ -203,43 +85,19 @@ class GPT2LM(LM):
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits retuned from the model
logits returned from the model
"""
with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0], add_special_tokens=False)
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context, add_special_tokens=False)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tokenizer.encode(until[0], add_special_tokens=False)
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
# for backwards compatibility
GPT2LM = HFLM
import os
import numpy as np
import transformers
from lm_eval.base import LM
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
......@@ -24,8 +36,11 @@ def get_result(response, ctxlen):
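The remainder of `get_result` is elided by the hunk; as a hedged sketch (assumed, not shown in this diff), the greedy check presumably compares each continuation token against the API's top-ranked token, using the standard fields of the OpenAI logprobs object (`tokens`, `token_logprobs`, `top_logprobs`):

# Assumed shape of the elided is_greedy loop; illustrative only.
def is_greedy_sketch(logprobs, ctxlen):
    for i in range(ctxlen, len(logprobs["token_logprobs"])):
        token = logprobs["tokens"][i]
        top = logprobs["top_logprobs"][i]      # dict: candidate token -> logprob
        if max(top, key=top.get) != token:
            return False
    return True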
def oa_completion(**kwargs):
import openai
""" Query OpenAI API for completion.
Retry with back-off until they respond
"""
import openai
backoff_time = 3
while True:
try:
......@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5
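The body of the retry loop is elided above; a minimal standalone sketch of the back-off pattern the docstring describes (the real function calls `openai.Completion.create(**kwargs)`):

import time

def with_backoff_sketch(call, **kwargs):
    backoff_time = 3
    while True:
        try:
            return call(**kwargs)
        except Exception:  # the real code catches OpenAI API errors
            time.sleep(backoff_time)
            backoff_time *= 1.5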
class GPT3LM(LM):
MAX_LENGTH = 2048
class GPT3LM(BaseLM):
REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
......@@ -50,10 +62,12 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
......@@ -64,53 +78,36 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
loglikelihoods = []
for string, in tqdm(requests):
encoded = self.tokenizer.encode_plus(string)["input_ids"]
rolling_token_windows = utils.get_rolling_token_windows(
token_list=encoded,
prefix_token=self.end_of_text_token_id,
max_seq_len=self.MAX_LENGTH,
context_len=1,
)
string_loglikelihoods = []
for input_tokens, pred_tokens in rolling_token_windows:
block_output = self.get_token_logprobs(
input_tokens=input_tokens,
pred_tokens=pred_tokens,
)
string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods)
return loglikelihoods
@property
def eot_token_id(self):
return self.tokenizer.eos_token_id
@property
def max_length(self):
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(self, requests):
import openai
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
......@@ -118,16 +115,18 @@ class GPT3LM(LM):
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
# max_length+1 because the API takes up to 2049 tokens, including the first context token
inp = (context_enc + continuation_enc)[-(self.max_length+1):]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))
inps.append(inp)
ctxlens.append(ctxlen)
......@@ -151,35 +150,14 @@ class GPT3LM(LM):
return reord.get_original(res)
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests):
if not requests: return []
import openai
if not requests:
return []
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0])
return (len(toks), x[0])
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
......@@ -193,34 +171,43 @@ class GPT3LM(LM):
lastuntil = x[1]
ret.append(x)
if ret: yield ret, lastuntil
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until`
# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = []
for context, _ in chunk:
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.MAX_GEN_TOKS,
max_tokens=self.max_gen_toks,
temperature=0.,
logprobs=10,
stop=until
stop=until,
)
for resp, (context, until) in zip(response.choices, chunk):
for resp, (context, until_) in zip(response.choices, chunk):
s = resp['text']
for term in until:
for term in until_:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
self.cache_hook.add_partial("greedy_until", (context, until_), s)
res.append(s)
return reord.get_original(res)
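To illustrate `sameuntil_chunks` from the loop above (standalone sketch): consecutive requests sharing the same `until` are grouped, capped at the chunk size:

def sameuntil_chunks_sketch(xs, size):
    ret, lastuntil = [], xs[0][1]
    for x in xs:
        if len(ret) >= size or x[1] != lastuntil:
            yield ret, lastuntil
            ret, lastuntil = [], x[1]
        ret.append(x)
    if ret:
        yield ret, lastuntil

reqs = [("a", ["\n"]), ("b", ["\n"]), ("c", ["."]), ("d", ["."])]
print([(len(chunk), until) for chunk, until in sameuntil_chunks_sketch(reqs, 20)])
# [(2, ['\n']), (2, ['.'])]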
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
import argparse
import json
import numpy as np
import random
import logging
from lm_eval import models, tasks, evaluator, base
from lm_eval import tasks, evaluator
logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
......@@ -22,9 +21,10 @@ def parse_args():
parser.add_argument('--no_cache', action="store_true")
return parser.parse_args()
def main():
args = parse_args()
assert not args.provide_description # not implemented
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
......@@ -35,15 +35,14 @@ def main():
task_names = args.tasks.split(",")
results = evaluator.simple_evaluate(
args.model,
args.model_args,
task_names,
args.description_path,
args.num_fewshot,
args.batch_size,
args.device,
args.no_cache,
args.limit
model=args.model,
model_args=args.model_args,
task_names=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
)
dumped = json.dumps(results, indent=2)
......@@ -54,8 +53,12 @@ def main():
with open(args.output_path, "w") as f:
f.write(dumped)
print(f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}")
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
)
print(evaluator.make_table(results))
if __name__ == "__main__":
main()
......@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
name="lm_eval",
version="0.0.1",
version="0.1.0",
author="Leo Gao",
author_email="lg@eleuther.ai",
description="A framework for evaluating autoregressive language models",
......@@ -20,7 +20,7 @@ setuptools.setup(
],
python_requires='>=3.6',
install_requires=[
"black==20.8b1",
"black",
"best_download>=0.0.6",
"datasets==1.15.1",
"click>=7.1",
......
......@@ -10,8 +10,8 @@ import pytest
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
os.system("rm test_cache.db")
......@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
......@@ -50,5 +51,5 @@ def test_evaluator(taskname, Task):
e1 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
e2 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
# check taht caching is working
# check that caching is working
assert e1 == e2
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
import os
import json
......@@ -10,10 +7,11 @@ import mock
import pickle
import hashlib
os.environ['OPENAI_API_SECRET_KEY'] = ""
def completion(**kwargs):
def mock_completion(**kwargs):
# Mock completion function
# Loads from a cached+pickled response if it exists, otherwise it will actually try to ping the OpenAI API
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
......@@ -21,16 +19,15 @@ def completion(**kwargs):
with open(fname, 'rb') as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, 'wb') as fh:
pickle.dump(ret, fh)
return ret
os.makedirs("tests/testdata", exist_ok=True)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
(ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'),
......@@ -69,15 +66,18 @@ def test_gpt3():
print([x[0] for x in vals])
targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
targets = [
-34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
......@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
gpt3.MAX_LENGTH = 5
with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.93490880000002
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
import pytest
import unittest.mock as mock
import lm_eval.models as models
......@@ -38,22 +39,31 @@ def test_gpt2():
assert gen == ', lazy fox and they both fall to the ground'
targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
targets = [
-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
-341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
def test_gpt2_perplexity():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487])
tgt = sum([
-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
-3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
])
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt2 to have shorter context length to induce rolling windows
gpt2.max_length = 5
with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
mock_max_length.return_value = 5
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813])
tgt = sum([
-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
-4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
])
assert perplexity == pytest.approx(tgt, rel=1e-3)
......@@ -4,13 +4,13 @@ import pytest
from itertools import islice
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, task_class):
print('Evaluating task', taskname)
#dl = Task.download
#Task.download = MagicMock()
task = Task()
#Task.download = dl
# dl = task_class.download
# task_class.download = MagicMock()
task = task_class()
# task_class.download = dl
assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False]
......@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
assert isinstance(task.higher_is_better(), dict)
assert task.aggregation().keys() == task.higher_is_better().keys()
for v in task.higher_is_better().values(): assert v in [True, False]
for v in task.higher_is_better().values():
assert v in [True, False]
assert isinstance(task.VERSION, int)
# test deterministic docs
# (don't test train because it's slow)
task2 = Task()
task2 = task_class()
limit = None
if taskname in ["triviaqa"]: limit = 10000
if taskname in ["triviaqa"]:
limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit))
......@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
assert reqs == reqs2
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, task_class):
print('Evaluating task', taskname)
task = Task()
task = task_class()
fns = []
if task.has_training_docs(): fns.append(task.training_docs)
if task.has_validation_docs(): fns.append(task.validation_docs)
if task.has_training_docs():
fns.append(task.training_docs)
if task.has_validation_docs():
fns.append(task.validation_docs)
# test doc might not have labels
#if task.has_test_docs(): fns.append(task.test_docs)
# if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns:
#print(list(islice(fn(), 10)))
# print(list(islice(fn(), 10)))
for doc in islice(fn(), 10):
txt = task.doc_to_text(doc)
......@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
reqs = task.construct_requests(doc, txt)
# construct_requests can return just one request
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess
for req in reqs:
......
......@@ -25,6 +25,7 @@ def assert_target(name, ob):
with open(fname, 'w') as fh:
json.dump(ob, fh, sort_keys=True)
def assert_target_hashed(name, ob):
fname = f"tests/testdata/{name}"
if os.path.exists(fname):
......@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):
# make sure eval results for a task version are stable
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, Task):
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_versions_stable(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
lm = models.get_model('dummy')()
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0: continue
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
res = []
random.seed(42)
......@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
for string, in reqs:
assert isinstance(string, str)
assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)
res = []
random.seed(42)
......@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
def greedy_until(reqs):
res = []
assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
for ctx, _ in reqs:
res.append("lol")
......@@ -97,5 +99,5 @@ def test_versions_stable(taskname, Task):
lm.greedy_until = greedy_until
limit = None
res = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
assert_target(f"{taskname}-v{Task.VERSION}-res", res)
result = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
assert_target(f"{taskname}-v{task_class.VERSION}-res", result)