Commit 778e0f91 authored by Leo Gao

Massive refactor

- Extract evaluator (still needs work to clean up)
- Add tests for evaluator
- Fix all the things that break on the new tests
- Misc cleanup
parent 52c1c56a
@@ -200,7 +200,8 @@ def f1_score(items):
     golds = unzipped_list[0]
     preds = unzipped_list[1]
     fscore = sklearn.metrics.f1_score(golds, preds)
-    return max(fscore)
+    return np.max(fscore)


 def acc_all(items):
...
import collections
import itertools


def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces

    task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]

    results = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have)
    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable

    docs = {}

    # get lists of each type of request
    for task_name, task in task_dict_items:
        # default to validation docs, fall back to test docs if validation is unavailable
        # TODO: the val-fallback-to-test system isn't final, we should revisit it at some point
        if task.has_validation_docs():
            task_doc_func = task.validation_docs
        elif task.has_test_docs():
            task_doc_func = task.test_docs

        for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, limit)):
            docs[(task_name, doc_id)] = doc

            ctx = task.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )

            reqs = task.construct_requests(doc, ctx)

            for i, req in enumerate(reqs):
                requests[req.type].append(req)
                # i: index in requests for a single task instance
                # doc_id: unique id that we can get back to a doc using `docs`
                requests_origin[req.type].append((i, task_name, doc, doc_id))

    # all responses for each (task, doc)
    process_res_queue = collections.defaultdict(list)

    # execute each type of request
    for reqtype, reqs in requests.items():
        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
        # only in index. We could implement some kind of caching, but that would be more of a bandaid
        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
        resps = getattr(lm, reqtype)([req.args for req in reqs])
        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]

        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
            process_res_queue[(task_name, doc_id)].append((i, resp))

    vals = collections.defaultdict(list)

    # unpack results, sort them back in order, and return control to the Task
    for (task_name, doc_id), requests in process_res_queue.items():
        requests.sort(key=lambda x: x[0])
        requests = [x[1] for x in requests]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, requests)
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)

    # aggregate results
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        results[task_name][metric] = task.aggregation()[metric](items)

    return results
\ No newline at end of file
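Not part of the diff, but for orientation: the new entry point can be driven directly. A minimal usage sketch, mirroring what main.py and the new integration test do; the "lambada" registry key is an assumption, since the full TASK_REGISTRY isn't shown in this excerpt.

# Usage sketch (editorial, not in the commit): build a task dict, grab a registered
# model, and evaluate. The "lambada" task key is assumed for illustration.
import json

from lm_eval import models, tasks, evaluator

lm = models.get_model('dummy')()
task_dict = tasks.get_task_dict(["lambada"])

# provide_description=False, num_fewshot=0, limit=10, as in the new test below
results = evaluator.evaluate(lm, task_dict, False, 0, 10)

# results[task_name][metric] holds one aggregated value per (task, metric) pair
print(json.dumps(results, indent=2))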
 from . import gpt2
 from . import gpt3
+from . import dummy

 MODEL_REGISTRY = {
     "gpt2": gpt2.GPT2LM,
     "gpt3": gpt3.GPT3LM,
+    "dummy": dummy.DummyLM,
 }
...
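The new `dummy` module itself isn't included in this excerpt. Judging from how it is used (the evaluator dispatches `getattr(lm, req.type)(...)` and the tests unpack loglikelihood results as `(ll, is_greedy)` pairs), a minimal stand-in might look roughly like the sketch below; this is an assumption for illustration, not the committed dummy.py, and the real class presumably derives from the project's base LM class.

# Hypothetical sketch of a dummy LM used purely to exercise the evaluator's plumbing.
# Assumed contract: loglikelihood takes a list of (context, continuation) tuples and
# returns one (logprob, is_greedy) pair per request. Other request types are omitted.
class DummyLM:
    def loglikelihood(self, requests):
        # Fixed, meaningless scores: enough to run every task end to end without a model.
        return [(-1.0, False) for _ in requests]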
@@ -12,7 +12,6 @@ def get_result(response, ctxlen):
     for i in range(ctxlen, len(response["logprobs"]["tokens"])):
         token = response["logprobs"]["tokens"][i]
-        print('TOK', token, response["logprobs"]["top_logprobs"][i])
         top_tokens = response["logprobs"]["top_logprobs"][i]
         top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
         if top_token != token:
...
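For context on the hunk above: `get_result` walks the token logprobs past the prompt (the first `ctxlen` entries) and checks whether each generated token was also the model's top-ranked choice. A rough sketch of the payload shape it indexes into, with only the keys the visible code touches and made-up values:

# Hypothetical fragment of the `response` object consumed above (values invented).
response = {
    "logprobs": {
        "tokens": [" The", " quick", " brown"],   # scored tokens, prompt then continuation
        "top_logprobs": [                          # one {token: logprob} map per position
            {" The": -0.1, " A": -2.3},
            {" quick": -0.4, " slow": -1.9},
            {" brown": -0.2, " red": -2.7},
        ],
    }
}
# With ctxlen = 1, the loop inspects positions 1 and 2: it takes the highest-logprob
# key from each top_logprobs map and compares it to the token actually produced,
# which is presumably how the is_greedy flag that LAMBADA later unpacks gets computed.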
@@ -37,7 +37,7 @@ TASK_REGISTRY = {
     "cb": superglue.CommitmentBank,
     "copa": superglue.Copa,
     "multirc": superglue.MultiRC,
-    "record": superglue.ReCoRD,
+    #"record": superglue.ReCoRD,
     "wic": superglue.WordsInContext,
     "wsc": superglue.SGWinogradSchemaChallenge,
...
@@ -52,8 +52,6 @@ class LAMBADA(Task):
     def process_results(self, doc, results):
         ll, is_greedy = results
-        print(ll)
         return {
             'perplexity': ll,
             'accuracy': int(is_greedy)
...
@@ -261,7 +261,7 @@ class ReCoRD(HFTask):
         return True

     def has_test_docs(self):
-        return True
+        return False

     def fewshot_description(self):
         # TODO: figure out actual description
@@ -322,6 +322,7 @@ class ReCoRD(HFTask):
         # - Evaluate the accuracy and token F1 PER EXAMPLE
         # - Average over all examples
         max_idx = np.argmax(np.array(results))
         prediction = doc["entities"][max_idx]
         gold_label_set = list(set(doc["answers"]))
         f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
...
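`metric_max_over_ground_truths` isn't shown in this diff; if it follows the standard SQuAD convention that its name and `squad_metrics.compute_f1` suggest, it simply keeps the best score of the prediction against any single gold answer. A sketch under that assumption:

# Assumed behaviour (not the project's actual implementation, which isn't in this diff).
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    # Score the prediction against every gold answer and keep the best score, so one
    # matching reference is enough for full credit on that metric for this example.
    return max(metric_fn(prediction, gt) for gt in ground_truths)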
@@ -5,7 +5,7 @@ import random
 import itertools
 import collections

-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator


 def parse_args():
@@ -32,81 +32,7 @@ def main():
     task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)

-    task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]
-
-    results = collections.defaultdict(dict)
-
-    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
-
-    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
-    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
-    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have)
-    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
-
-    docs = {}
-
-    # get lists of each type of request
-    for task_name, task in task_dict_items:
-        # default to validation docs, fall back to test docs if validation is unavailable
-        # TODO: the val-fallback-to-test system isn't final, we should revisit it at some point
-        if task.has_validation_docs():
-            task_doc_func = task.validation_docs
-        elif task.has_test_docs():
-            task_doc_func = task.test_docs
-
-        for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, args.limit)):
-            docs[(task_name, doc_id)] = doc
-
-            ctx = task.fewshot_context(
-                doc=doc,
-                provide_description=args.provide_description,
-                num_fewshot=args.num_fewshot,
-            )
-
-            reqs = task.construct_requests(doc, ctx)
-
-            for i, req in enumerate(reqs):
-                requests[req.type].append(req)
-                # i: index in requests for a single task instance
-                # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.type].append((i, task_name, doc, doc_id))
-
-    # all responses for each (task, doc)
-    process_res_queue = collections.defaultdict(list)
-
-    # execute each type of request
-    for reqtype, reqs in requests.items():
-        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
-        # only in index. We could implement some kind of caching, but that would be more of a bandaid
-        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
-        resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
-
-        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
-            process_res_queue[(task_name, doc_id)].append((i, resp))
-
-    vals = collections.defaultdict(list)
-
-    # unpack results, sort them back in order, and return control to the Task
-    for (task_name, doc_id), requests in process_res_queue.items():
-        requests.sort(key=lambda x: x[0])
-        requests = [x[1] for x in requests]
-
-        task = task_dict[task_name]
-        doc = docs[(task_name, doc_id)]
-
-        metrics = task.process_results(doc, requests)
-        for metric, value in metrics.items():
-            vals[(task_name, metric)].append(value)
-
-    # aggregate results
-    for (task_name, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric] = task.aggregation()[metric](items)
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

     dumped = json.dumps(results, indent=2)
     print(dumped)
...
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
@@ -2,7 +2,7 @@ import lm_eval.models as models
 import lm_eval.base as base

 def test_gpt2():
-    gpt2 = models.get_model('gpt2')(device="cpu")
+    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
     (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
         ('The quick brown fox jumps over the lazy', ' dog'),
         ('The quick brown fox jumps over the lazy', ' cat'),
...
@@ -46,6 +46,6 @@ def test_documents_and_requests(taskname, Task):
     reqs = task.construct_requests(doc, txt)
-    # todo: mock lm by pluggin what's currently in main.py in here
+    # todo: mock lm after refactoring evaluator.py to not be a mess
     for req in reqs:
         assert isinstance(req, base.Request)
\ No newline at end of file