Commit 778e0f91 authored by Leo Gao

Massive refactor

- Extract evaluator (still needs work to clean up)
- Add tests for evaluator
- Fix all the things that break on the new tests
- Misc cleanup
parent 52c1c56a
@@ -200,7 +200,8 @@ def f1_score(items):
     golds = unzipped_list[0]
     preds = unzipped_list[1]
     fscore = sklearn.metrics.f1_score(golds, preds)
-    return max(fscore)
+    return np.max(fscore)


 def acc_all(items):
...
import collections
import itertools


def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces

    task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]

    results = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have)
    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable

    docs = {}

    # get lists of each type of request
    for task_name, task in task_dict_items:
        # default to validation docs, fall back to test docs if validation is unavailable
        # TODO: the val-fallback-to-test system isn't final, we should revisit it at some point
        if task.has_validation_docs():
            task_doc_func = task.validation_docs
        elif task.has_test_docs():
            task_doc_func = task.test_docs

        for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, limit)):
            docs[(task_name, doc_id)] = doc

            ctx = task.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )

            reqs = task.construct_requests(doc, ctx)

            for i, req in enumerate(reqs):
                requests[req.type].append(req)
                # i: index in requests for a single task instance
                # doc_id: unique id that we can get back to a doc using `docs`
                requests_origin[req.type].append((i, task_name, doc, doc_id))

    # all responses for each (task, doc)
    process_res_queue = collections.defaultdict(list)

    # execute each type of request
    for reqtype, reqs in requests.items():
        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
        # only in index. We could implement some kind of caching, but that would be more of a bandaid
        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
        resps = getattr(lm, reqtype)([req.args for req in reqs])
        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]

        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
            process_res_queue[(task_name, doc_id)].append((i, resp))

    vals = collections.defaultdict(list)

    # unpack results, sort them back in order, and return control to the Task
    for (task_name, doc_id), requests in process_res_queue.items():
        requests.sort(key=lambda x: x[0])
        requests = [x[1] for x in requests]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, requests)
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)

    # aggregate results
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        results[task_name][metric] = task.aggregation()[metric](items)

    return results
\ No newline at end of file
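Not part of the diff, but for orientation: the new entry point can be driven directly. A minimal usage sketch, mirroring what main.py and the new integration test do; the "lambada" registry key is an assumption, since the full TASK_REGISTRY isn't shown in this excerpt.

# Usage sketch (editorial, not in the commit): build a task dict, grab a registered
# model, and evaluate. The "lambada" task key is assumed for illustration.
import json

from lm_eval import models, tasks, evaluator

lm = models.get_model('dummy')()
task_dict = tasks.get_task_dict(["lambada"])

# provide_description=False, num_fewshot=0, limit=10, as in the new test below
results = evaluator.evaluate(lm, task_dict, False, 0, 10)

# results[task_name][metric] holds one aggregated value per (task, metric) pair
print(json.dumps(results, indent=2))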
 from . import gpt2
 from . import gpt3
+from . import dummy

 MODEL_REGISTRY = {
     "gpt2": gpt2.GPT2LM,
     "gpt3": gpt3.GPT3LM,
+    "dummy": dummy.DummyLM,
 }
...
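The new `dummy` module itself isn't included in this excerpt. Judging from how it is used (the evaluator dispatches `getattr(lm, req.type)(...)` and the tests unpack loglikelihood results as `(ll, is_greedy)` pairs), a minimal stand-in might look roughly like the sketch below; this is an assumption for illustration, not the committed dummy.py, and the real class presumably derives from the project's base LM class.

# Hypothetical sketch of a dummy LM used purely to exercise the evaluator's plumbing.
# Assumed contract: loglikelihood takes a list of (context, continuation) tuples and
# returns one (logprob, is_greedy) pair per request. Other request types are omitted.
class DummyLM:
    def loglikelihood(self, requests):
        # Fixed, meaningless scores: enough to run every task end to end without a model.
        return [(-1.0, False) for _ in requests]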
@@ -12,7 +12,6 @@ def get_result(response, ctxlen):
     for i in range(ctxlen, len(response["logprobs"]["tokens"])):
         token = response["logprobs"]["tokens"][i]
-        print('TOK', token, response["logprobs"]["top_logprobs"][i])
         top_tokens = response["logprobs"]["top_logprobs"][i]
         top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
         if top_token != token:
...
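For context on the hunk above: `get_result` walks the token logprobs past the prompt (the first `ctxlen` entries) and checks whether each generated token was also the model's top-ranked choice. A rough sketch of the payload shape it indexes into, with only the keys the visible code touches and made-up values:

# Hypothetical fragment of the `response` object consumed above (values invented).
response = {
    "logprobs": {
        "tokens": [" The", " quick", " brown"],   # scored tokens, prompt then continuation
        "top_logprobs": [                          # one {token: logprob} map per position
            {" The": -0.1, " A": -2.3},
            {" quick": -0.4, " slow": -1.9},
            {" brown": -0.2, " red": -2.7},
        ],
    }
}
# With ctxlen = 1, the loop inspects positions 1 and 2: it takes the highest-logprob
# key from each top_logprobs map and compares it to the token actually produced,
# which is presumably how the is_greedy flag that LAMBADA later unpacks gets computed.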
@@ -37,7 +37,7 @@ TASK_REGISTRY = {
     "cb": superglue.CommitmentBank,
     "copa": superglue.Copa,
     "multirc": superglue.MultiRC,
-    "record": superglue.ReCoRD,
+    #"record": superglue.ReCoRD,
     "wic": superglue.WordsInContext,
     "wsc": superglue.SGWinogradSchemaChallenge,
...
@@ -52,8 +52,6 @@ class LAMBADA(Task):
     def process_results(self, doc, results):
         ll, is_greedy = results
-        print(ll)
         return {
             'perplexity': ll,
             'accuracy': int(is_greedy)
...
@@ -261,7 +261,7 @@ class ReCoRD(HFTask):
         return True

     def has_test_docs(self):
-        return True
+        return False

     def fewshot_description(self):
         # TODO: figure out actual description
@@ -322,6 +322,7 @@ class ReCoRD(HFTask):
         # - Evaluate the accuracy and token F1 PER EXAMPLE
         # - Average over all examples
         max_idx = np.argmax(np.array(results))
         prediction = doc["entities"][max_idx]
         gold_label_set = list(set(doc["answers"]))
         f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
...
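`metric_max_over_ground_truths` isn't shown in this diff; if it follows the standard SQuAD convention that its name and `squad_metrics.compute_f1` suggest, it simply keeps the best score of the prediction against any single gold answer. A sketch under that assumption:

# Assumed behaviour (not the project's actual implementation, which isn't in this diff).
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    # Score the prediction against every gold answer and keep the best score, so one
    # matching reference is enough for full credit on that metric for this example.
    return max(metric_fn(prediction, gt) for gt in ground_truths)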
@@ -5,7 +5,7 @@ import random
 import itertools
 import collections

-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator


 def parse_args():
@@ -32,81 +32,7 @@ def main():
     task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)

-    task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]
-
-    results = collections.defaultdict(dict)
-
-    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
-
-    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
-    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
-    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have)
-    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
-
-    docs = {}
-
-    # get lists of each type of request
-    for task_name, task in task_dict_items:
-        # default to validation docs, fall back to test docs if validation is unavailable
-        # TODO: the val-fallback-to-test system isn't final, we should revisit it at some point
-        if task.has_validation_docs():
-            task_doc_func = task.validation_docs
-        elif task.has_test_docs():
-            task_doc_func = task.test_docs
-
-        for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, args.limit)):
-            docs[(task_name, doc_id)] = doc
-
-            ctx = task.fewshot_context(
-                doc=doc,
-                provide_description=args.provide_description,
-                num_fewshot=args.num_fewshot,
-            )
-
-            reqs = task.construct_requests(doc, ctx)
-
-            for i, req in enumerate(reqs):
-                requests[req.type].append(req)
-                # i: index in requests for a single task instance
-                # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.type].append((i, task_name, doc, doc_id))
-
-    # all responses for each (task, doc)
-    process_res_queue = collections.defaultdict(list)
-
-    # execute each type of request
-    for reqtype, reqs in requests.items():
-        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
-        # only in index. We could implement some kind of caching, but that would be more of a bandaid
-        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
-        resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
-
-        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
-            process_res_queue[(task_name, doc_id)].append((i, resp))
-
-    vals = collections.defaultdict(list)
-
-    # unpack results, sort them back in order, and return control to the Task
-    for (task_name, doc_id), requests in process_res_queue.items():
-        requests.sort(key=lambda x: x[0])
-        requests = [x[1] for x in requests]
-
-        task = task_dict[task_name]
-        doc = docs[(task_name, doc_id)]
-
-        metrics = task.process_results(doc, requests)
-        for metric, value in metrics.items():
-            vals[(task_name, metric)].append(value)
-
-    # aggregate results
-    for (task_name, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric] = task.aggregation()[metric](items)
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

     dumped = json.dumps(results, indent=2)
     print(dumped)
...
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
@@ -2,7 +2,7 @@ import lm_eval.models as models
 import lm_eval.base as base

 def test_gpt2():
-    gpt2 = models.get_model('gpt2')(device="cpu")
+    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
     (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
         ('The quick brown fox jumps over the lazy', ' dog'),
         ('The quick brown fox jumps over the lazy', ' cat'),
...
@@ -46,6 +46,6 @@ def test_documents_and_requests(taskname, Task):
     reqs = task.construct_requests(doc, txt)
-    # todo: mock lm by pluggin what's currently in main.py in here
+    # todo: mock lm after refactoring evaluator.py to not be a mess
     for req in reqs:
         assert isinstance(req, base.Request)
\ No newline at end of file