Commit a21df355 authored by thefazzer

Merge remote-tracking branch 'origin/master' into fazz/refactor-task-coqa

parents efa810f0 1815286c
@@ -29,3 +29,14 @@ def simple_parse_args_string(args_string):
def join_iters(iters):
    for iter in iters:
        yield from iter


def chunks(iter, n):
    # Yield successive lists of at most n items from the iterable.
    arr = []
    for x in iter:
        arr.append(x)
        if len(arr) == n:
            yield arr
            arr = []

    if arr: yield arr
\ No newline at end of file
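# Usage sketch (illustrative, not part of the diff): chunks batches an iterable
# into fixed-size lists, and join_iters flattens such batches back into a
# single stream. The batch size of 4 is an arbitrary example value.
batched = list(chunks(range(10), 4))      # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
flattened = list(join_iters(batched))     # [0, 1, 2, ..., 9]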
import os
from functools import reduce
import operator
from tqdm import tqdm
import json


class ExitCodeError(Exception):
    pass


def sh(x):
    # Run a shell command, raising ExitCodeError on a nonzero exit status.
    if os.system(x):
        raise ExitCodeError()


def ls(x):
    # List the entries of a directory, each prefixed with the directory path.
    return [x + '/' + fn for fn in os.listdir(x)]


def lsr(x):
    # Recursively list every file under a path.
    if os.path.isdir(x):
        return reduce(operator.add, map(lsr, ls(x)), [])
    else:
        return [x]


def fwrite(fname, content):
    with open(fname, 'w') as fh:
        fh.write(content)


def fread(fname):
    with open(fname) as fh:
        return fh.read()
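# Usage sketch (illustrative): recursively gather and read every file under a
# directory. The 'data' path is a hypothetical example, not something this
# commit references.
contents = [fread(path) for path in lsr('data')]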
class each:
    # x >> each(f) maps f over x and collects the results into a list.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class filt:
    # x >> filt(f) keeps only the elements of x for which f is truthy.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(filter(self.f, other))


class apply:
    # x >> apply(f) applies f to x as a whole.
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return self.f(other)


class one:
    # x >> one() extracts the single element of x, or returns None on failure.
    def __rrshift__(self, other):
        try:
            if isinstance(other, list):
                assert len(other) == 1
                return other[0]
            return next(other)
        except:
            return None


class join:
    # x >> join(sep) joins the elements of x with sep, or returns None on failure.
    def __init__(self, sep):
        self.sep = sep

    def __rrshift__(self, other):
        if other is None:
            return
        try:
            return self.sep.join(other)
        except:
            return None
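# Usage sketch (illustrative) of the >> pipeline helpers above: filter, map,
# join and whole-value application read left to right.
words = ['alpha', 'beta', 'gamma'] >> filt(lambda w: len(w) > 4) >> each(str.upper)
line = words >> join(', ')     # 'ALPHA, GAMMA'
count = words >> apply(len)    # 2
first = ['only'] >> one()      # 'only'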
Y = object()


def id(x):
    return x


class Reflective:
    # X = Reflective() (see below) builds small one-argument lambdas from
    # attribute access, indexing and arithmetic, e.g. X['key'], X.strip(), X * 2.
    # Combining with the Y sentinel (e.g. X + Y) yields a two-argument function
    # that also accepts a single (x, y) pair.
    def __getattribute__(self, f):
        def _fn(*args, **kwargs):
            return lambda x: x.__getattribute__(f)(*args, **kwargs)

        return _fn

    def __getitem__(self, a):
        return lambda x: x[a]

    def __mul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return x * y

            return _f
        return lambda x: x * other

    def __rmul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return y * x

            return _f
        return lambda x: other * x

    def __add__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return x + y

            return _f
        return lambda x: x + other

    def __radd__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y is None:
                    x, y = x
                return y + x

            return _f
        return lambda x: other + x
# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
    # Left fold: f(f(f(init, a0), a1), a2) ...
    curr = init
    for elem in arr:
        curr = f(curr, elem)
    return curr


# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
    # Right fold: ... f(a0, f(a1, f(a2, init)))
    curr = init
    for elem in arr[::-1]:
        curr = f(elem, curr)
    return curr


def comp(*fs):
    # Function composition: comp(f, g, h)(x) == f(g(h(x))).
    if len(fs) == 1:
        return fs[0]

    def _f(x):
        for f in fs[::-1]:
            x = f(x)
        return x

    return _f
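# Worked examples (illustrative) for the folds and composition above: foldl
# associates to the left, foldr to the right, and comp composes right to left.
assert foldl(lambda acc, x: acc - x, 10, [1, 2, 3]) == 4    # ((10 - 1) - 2) - 3
assert foldr(lambda x, acc: x - acc, 0, [1, 2, 3]) == 2     # 1 - (2 - (3 - 0))
assert comp(str, abs)(-5) == '5'                            # str(abs(-5))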
X = Reflective()
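# Usage sketch (illustrative) for the X / Y placeholders: X builds one-argument
# lambdas from attribute access, indexing and arithmetic, and pairing it with
# the Y sentinel builds two-argument (or pair-taking) functions.
double = X * 2
assert double(3) == 6
assert (X['name'])({'name': 'coqa'}) == 'coqa'
assert (X + Y)((2, 3)) == 5                      # also callable as (X + Y)(2, 3)
assert (['a', 'b'] >> each(X.upper())) == ['A', 'B']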
@@ -5,7 +5,7 @@ import random
import itertools
import collections
-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator


def parse_args():
@@ -32,75 +32,7 @@ def main():
    task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # TODO: fall back to test docs
-    task_dict_items = [(name, task) for name, task in task_dict.items() if task.has_validation_docs()]
-    results = collections.defaultdict(dict)
-    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
-    # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
-    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
-    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have).
-    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
-    docs = {}
-    # get lists of each type of request
-    for task_name, task in task_dict_items:
-        for doc_id, doc in enumerate(itertools.islice(task.validation_docs(), 0, args.limit)):
-            docs[(task_name, doc_id)] = doc
-            ctx = task.fewshot_context(
-                doc=doc,
-                provide_description=args.provide_description,
-                num_fewshot=args.num_fewshot,
-            )
-            reqs = task.construct_requests(doc, ctx)
-            for i, req in enumerate(reqs):
-                requests[req.type].append(req)
-                # i: index in requests for a single task instance
-                # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.type].append((i, task_name, doc, doc_id))
-    # all responses for each (task, doc)
-    process_res_queue = collections.defaultdict(list)
-    # execute each type of request
-    for reqtype, reqs in requests.items():
-        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
-        # only in index. We could implement some kind of caching, but that would be more of a bandaid
-        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
-        resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
-        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
-            process_res_queue[(task_name, doc_id)].append((i, resp))
-    vals = collections.defaultdict(list)
-    # unpack results, sort back into order, and return control to Task
-    for (task_name, doc_id), requests in process_res_queue.items():
-        requests.sort(key=lambda x: x[0])
-        requests = [x[1] for x in requests]
-        task = task_dict[task_name]
-        doc = docs[(task_name, doc_id)]
-        metrics = task.process_results(doc, requests)
-        for metric, value in metrics.items():
-            vals[(task_name, metric)].append(value)
-    # aggregate results
-    for (task_name, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric] = task.aggregation()[metric](items)
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
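# Programmatic sketch (illustrative) of the refactored flow above: the inline
# request/response plumbing that used to live in main() is now a single call to
# evaluator.evaluate. The 'dummy' model and 'lambada' task are example values
# and may not match what is registered at this commit.
from lm_eval import models, tasks, evaluator

lm = models.get_model('dummy')()
task_dict = tasks.get_task_dict(['lambada'])
results = evaluator.evaluate(lm, task_dict, False, 0, 10)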
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()
    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
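# To run just this integration test (sketch; the file path is assumed to be
# tests/test_evaluator.py and may differ):
#
#   pytest tests/test_evaluator.py -k test_evaluator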
import lm_eval.models as models
import lm_eval.base as base


def test_gpt2():
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
    (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
        ('The quick brown fox jumps over the lazy', ' dog'),
        ('The quick brown fox jumps over the lazy', ' cat'),
    ])

    # ' dog' should be the more likely continuation of the pangram than ' cat'.
    assert ll_dog > ll_cat
    assert not ig_cat
@@ -8,10 +8,10 @@ import pytest
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task):
    print('Evaluating task', taskname)
-    dl = Task.download
-    Task.download = MagicMock()
+    #dl = Task.download
+    #Task.download = MagicMock()
    task = Task()
-    Task.download = dl
+    #Task.download = dl

    assert task.has_training_docs() in [True, False]
    assert task.has_validation_docs() in [True, False]
@@ -46,6 +46,6 @@ def test_documents_and_requests(taskname, Task):
        reqs = task.construct_requests(doc, txt)

-        # todo: mock lm by plugging what's currently in main.py in here
+        # todo: mock lm after refactoring evaluator.py to not be a mess
        for req in reqs:
-            assert isinstance(req, base.Request)
\ No newline at end of file
+            assert isinstance(req, base.Request)