Commit a21df355 authored by thefazzer

Merge remote-tracking branch 'origin/master' into fazz/refactor-task-coqa

parents efa810f0 1815286c
@@ -29,3 +29,14 @@ def simple_parse_args_string(args_string):
def join_iters(iters):
    for iter in iters:
        yield from iter


def chunks(iter, n):
    # Yield successive lists of at most n items from the iterable.
    arr = []
    for x in iter:
        arr.append(x)
        if len(arr) == n:
            yield arr
            arr = []

    if arr: yield arr
\ No newline at end of file
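# Usage sketch (illustrative, not part of the diff): chunks batches an iterable
# into fixed-size lists, and join_iters flattens such batches back into a
# single stream. The batch size of 4 is an arbitrary example value.
batched = list(chunks(range(10), 4))      # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
flattened = list(join_iters(batched))     # [0, 1, 2, ..., 9]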
import os
from functools import reduce
import operator
from tqdm import tqdm
import json


class ExitCodeError(Exception):
    pass


def sh(x):
    # Run a shell command, raising ExitCodeError on a nonzero exit status.
    if os.system(x):
        raise ExitCodeError()


def ls(x):
    # List the entries of a directory, each prefixed with the directory path.
    return [x + '/' + fn for fn in os.listdir(x)]


def lsr(x):
    # Recursively list every file under a path.
    if os.path.isdir(x):
        return reduce(operator.add, map(lsr, ls(x)), [])
    else:
        return [x]


def fwrite(fname, content):
    with open(fname, 'w') as fh:
        fh.write(content)


def fread(fname):
    with open(fname) as fh:
        return fh.read()
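# Usage sketch (illustrative): recursively gather and read every file under a
# directory. The 'data' path is a hypothetical example, not something this
# commit references.
contents = [fread(path) for path in lsr('data')]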
class each:
    # x >> each(f) maps f over x and collects the results into a list.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class filt:
    # x >> filt(f) keeps only the elements of x for which f is truthy.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(filter(self.f, other))


class apply:
    # x >> apply(f) applies f to x as a whole.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return self.f(other)


class one:
    # x >> one() extracts the single element of x, or returns None on failure.
    def __rrshift__(self, other):
        try:
            if isinstance(other, list):
                assert len(other) == 1
                return other[0]
            return next(other)
        except:
            return None


class join:
    # x >> join(sep) joins the elements of x with sep, or returns None on failure.
    def __init__(self, sep):
        self.sep = sep

    def __rrshift__(self, other):
        if other is None:
            return
        try:
            return self.sep.join(other)
        except:
            return None
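# Usage sketch (illustrative) of the >> pipeline helpers above: filter, map,
# join and whole-value application read left to right.
words = ['alpha', 'beta', 'gamma'] >> filt(lambda w: len(w) > 4) >> each(str.upper)
line = words >> join(', ')     # 'ALPHA, GAMMA'
count = words >> apply(len)    # 2
first = ['only'] >> one()      # 'only'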
Y = object()


def id(x):
    return x


class Reflective:
    # X = Reflective() (see below) builds small one-argument lambdas from
    # attribute access, indexing and arithmetic, e.g. X['key'], X.strip(), X * 2.
    # Combining with the Y sentinel (e.g. X + Y) yields a two-argument function
    # that also accepts a single (x, y) pair.
    def __getattribute__(self, f):
        def _fn(*args, **kwargs):
            return lambda x: x.__getattribute__(f)(*args, **kwargs)

        return _fn

    def __getitem__(self, a):
        return lambda x: x[a]

    def __mul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return x * y

            return _f
        return lambda x: x * other

    def __rmul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return y * x

            return _f
        return lambda x: other * x

    def __add__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return x + y

            return _f
        return lambda x: x + other

    def __radd__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return y + x

            return _f
        return lambda x: other + x
# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
    # Left fold: f(f(f(init, a0), a1), a2) ...
    curr = init
    for elem in arr:
        curr = f(curr, elem)
    return curr


# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
    # Right fold: ... f(a0, f(a1, f(a2, init)))
    curr = init
    for elem in arr[::-1]:
        curr = f(elem, curr)
    return curr


def comp(*fs):
    # Function composition: comp(f, g, h)(x) == f(g(h(x))).
    if len(fs) == 1:
        return fs[0]

    def _f(x):
        for f in fs[::-1]:
            x = f(x)
        return x

    return _f
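# Worked examples (illustrative) for the folds and composition above: foldl
# associates to the left, foldr to the right, and comp composes right to left.
assert foldl(lambda acc, x: acc - x, 10, [1, 2, 3]) == 4    # ((10 - 1) - 2) - 3
assert foldr(lambda x, acc: x - acc, 0, [1, 2, 3]) == 2     # 1 - (2 - (3 - 0))
assert comp(str, abs)(-5) == '5'                            # str(abs(-5))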
X = Reflective()
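# Usage sketch (illustrative) for the X / Y placeholders: X builds one-argument
# lambdas from attribute access, indexing and arithmetic, and pairing it with
# the Y sentinel builds two-argument (or pair-taking) functions.
double = X * 2
assert double(3) == 6
assert (X['name'])({'name': 'coqa'}) == 'coqa'
assert (X + Y)((2, 3)) == 5                      # also callable as (X + Y)(2, 3)
assert (['a', 'b'] >> each(X.upper())) == ['A', 'B']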
@@ -5,7 +5,7 @@ import random
import itertools
import collections
-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator


def parse_args():
@@ -32,75 +32,7 @@ def main():
    task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # TODO: fall back to test docs
-    task_dict_items = [(name, task) for name, task in task_dict.items() if task.has_validation_docs()]
-    results = collections.defaultdict(dict)
-    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
-    # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
-    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
-    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have).
-    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
-    docs = {}
-    # get lists of each type of request
-    for task_name, task in task_dict_items:
-        for doc_id, doc in enumerate(itertools.islice(task.validation_docs(), 0, args.limit)):
-            docs[(task_name, doc_id)] = doc
-            ctx = task.fewshot_context(
-                doc=doc,
-                provide_description=args.provide_description,
-                num_fewshot=args.num_fewshot,
-            )
-            reqs = task.construct_requests(doc, ctx)
-            for i, req in enumerate(reqs):
-                requests[req.type].append(req)
-                # i: index in requests for a single task instance
-                # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.type].append((i, task_name, doc, doc_id))
-    # all responses for each (task, doc)
-    process_res_queue = collections.defaultdict(list)
-    # execute each type of request
-    for reqtype, reqs in requests.items():
-        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
-        # only in index. We could implement some kind of caching, but that would be more of a bandaid
-        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
-        resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
-        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
-            process_res_queue[(task_name, doc_id)].append((i, resp))
-    vals = collections.defaultdict(list)
-    # unpack results, sort back into order, and return control to Task
-    for (task_name, doc_id), requests in process_res_queue.items():
-        requests.sort(key=lambda x: x[0])
-        requests = [x[1] for x in requests]
-        task = task_dict[task_name]
-        doc = docs[(task_name, doc_id)]
-        metrics = task.process_results(doc, requests)
-        for metric, value in metrics.items():
-            vals[(task_name, metric)].append(value)
-    # aggregate results
-    for (task_name, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric] = task.aggregation()[metric](items)
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
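# Programmatic sketch (illustrative) of the refactored flow above: the inline
# request/response plumbing that used to live in main() is now a single call to
# evaluator.evaluate. The 'dummy' model and 'lambada' task are example values
# and may not match what is registered at this commit.
from lm_eval import models, tasks, evaluator

lm = models.get_model('dummy')()
task_dict = tasks.get_task_dict(['lambada'])
results = evaluator.evaluate(lm, task_dict, False, 0, 10)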
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()
    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
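# To run just this integration test (sketch; the file path is assumed to be
# tests/test_evaluator.py and may differ):
#
#   pytest tests/test_evaluator.py -k test_evaluator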
import lm_eval.models as models
import lm_eval.base as base


def test_gpt2():
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
    (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
        ('The quick brown fox jumps over the lazy', ' dog'),
        ('The quick brown fox jumps over the lazy', ' cat'),
    ])

    # ' dog' should be the more likely continuation of the pangram than ' cat'.
    assert ll_dog > ll_cat
    assert not ig_cat
@@ -8,10 +8,10 @@ import pytest
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task):
    print('Evaluating task', taskname)
-    dl = Task.download
-    Task.download = MagicMock()
+    #dl = Task.download
+    #Task.download = MagicMock()
    task = Task()
-    Task.download = dl
+    #Task.download = dl

    assert task.has_training_docs() in [True, False]
    assert task.has_validation_docs() in [True, False]
@@ -46,6 +46,6 @@ def test_documents_and_requests(taskname, Task):
        reqs = task.construct_requests(doc, txt)

-        # todo: mock lm by plugging what's currently in main.py in here
+        # todo: mock lm after refactoring evaluator.py to not be a mess
        for req in reqs:
-            assert isinstance(req, base.Request)
\ No newline at end of file
+            assert isinstance(req, base.Request)