Commit 55e62507 authored by researcher2

Merge branch 'master' into researcher2

parents bb0eafbb 26f0233f
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
setuptools.setup(
    name="lm_eval",
-    version="0.0.1",
+    version="0.1.0",
    author="Leo Gao",
    author_email="lg@eleuther.ai",
    description="A framework for evaluating autoregressive language models",
@@ -20,7 +20,7 @@ setuptools.setup(
    ],
    python_requires='>=3.6',
    install_requires=[
-        "black==20.8b1",
+        "black",
        "best_download>=0.0.6",
        "datasets==1.15.1",
        "click>=7.1",
...
import random

import lm_eval.tasks
import lm_eval.models


def test_description_dict():
    seed = 42
    num_examples = 1
    task_names = ["hellaswag", "winogrande"]
    description_dict = {
        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "winogrande": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
    }
    task_dict = lm_eval.tasks.get_task_dict(task_names)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(seed)
        if task.has_training_docs():
            docs = task.training_docs()
        elif task.has_validation_docs():
            docs = task.validation_docs()
        elif task.has_test_docs():
            docs = task.test_docs()
        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )
        for _, doc in (
            zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
        ):
            ctx = task.fewshot_context(
                doc=doc,
                num_fewshot=1,
                rnd=rnd,
                description=description,
            )
            assert description in ctx
@@ -10,8 +10,8 @@ import pytest
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_evaluator(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])
    os.system("rm test_cache.db")
@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
    def ll_fn(reqs):
        for ctx, cont in reqs:
-            if len(ctx) == 0: continue
+            if len(ctx) == 0:
+                continue
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'
@@ -47,8 +48,22 @@ def test_evaluator(taskname, Task):
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    e1 = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
+    e2 = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
-    # check taht caching is working
+    # check that caching is working
    assert e1 == e2
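For orientation, the hunk above switches `evaluator.evaluate` to explicit keyword arguments and threads through the new `description_dict` parameter. Below is a minimal sketch of the same call shape outside the test harness, assuming only what the diff itself shows; the task choice, limit, and description text are illustrative, and the `dummy` model is the stand-in LM these tests use.

import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator

# Illustrative sketch, not part of the commit.
task_dict = tasks.get_task_dict(["hellaswag"])
lm = models.get_model("dummy")()  # placeholder LM used by these tests; its outputs are not meaningful

results = evaluator.evaluate(
    lm=lm,
    task_dict=task_dict,
    num_fewshot=0,
    limit=10,
    bootstrap_iters=10,
    description_dict={"hellaswag": "Label for the relevant action:"},  # illustrative description text
)
# Per the fixture files added in this commit, the return value is a dict of the
# form {"results": {...}, "versions": {...}}.
print(results["results"])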
-import lm_eval.tasks as tasks
import lm_eval.models as models
-import lm_eval.evaluator as evaluator
-import random
import pytest
import os
import json
@@ -10,10 +7,11 @@ import mock
import pickle
import hashlib

-os.environ['OPENAI_API_SECRET_KEY'] = ""
-def completion(**kwargs):
+def mock_completion(**kwargs):
+    # Mock completion function
+    # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
+    os.makedirs("tests/testdata", exist_ok=True)
    hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
    fname = f"tests/testdata/gpt3_test_{hash}.pkl"
@@ -21,16 +19,15 @@ def completion(**kwargs):
        with open(fname, 'rb') as fh:
            return pickle.load(fh)
    ret = openai.Completion.create(**kwargs)
+    ret.api_key = ""
    with open(fname, 'wb') as fh:
        pickle.dump(ret, fh)
    return ret
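The mock above (only partially visible in this hunk; the `if os.path.exists(fname):` guard sits outside the shown context) is a record-and-replay cache: the first run calls the real OpenAI endpoint and pickles the response under a hash of the request, and every later run replays the pickle with no network access or API key. Here is a self-contained sketch of that pattern, with a generic `expensive_call` standing in for `openai.Completion.create`; the names are illustrative, not from lm_eval.

import hashlib
import json
import os
import pickle


def cached_call(cache_dir, expensive_call, **kwargs):
    # Key the cache entry by a stable hash of the keyword arguments.
    key = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode("utf-8")).hexdigest()
    fname = os.path.join(cache_dir, f"call_{key}.pkl")

    if os.path.exists(fname):
        # Replay: no network, no credentials needed.
        with open(fname, "rb") as fh:
            return pickle.load(fh)

    # Record: perform the real call once and persist the result.
    os.makedirs(cache_dir, exist_ok=True)
    ret = expensive_call(**kwargs)
    with open(fname, "wb") as fh:
        pickle.dump(ret, fh)
    return ret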
os.makedirs("tests/testdata", exist_ok=True) @mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
def test_gpt3(): def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
(ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([ (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'), ('The quick brown fox jumps over the lazy', ' dog'),
...@@ -69,15 +66,18 @@ def test_gpt3(): ...@@ -69,15 +66,18 @@ def test_gpt3():
print([x[0] for x in vals]) print([x[0] for x in vals])
targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964] targets = [
-34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
]
for (pred, _), tgt in zip(vals, targets): for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3) assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
def test_gpt3_perplexity(): def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada") gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss." test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0] perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
...@@ -85,7 +85,9 @@ def test_gpt3_perplexity(): ...@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
assert perplexity == pytest.approx(tgt, rel=1e-3) assert perplexity == pytest.approx(tgt, rel=1e-3)
    # Hack: modify gpt3 to have shorter context length to induce rolling windows
-    gpt3.MAX_LENGTH = 5
+    with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+        mock_max_length.return_value = 5
+        gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
        perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
-    tgt = -101.93490880000002
+        tgt = -101.81967209999999
        assert perplexity == pytest.approx(tgt, rel=1e-3)
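Both the gpt3 and gpt2 versions of this hack now patch `max_length` through `unittest.mock.PropertyMock`, because a read-only `@property` on the class cannot be overridden by assigning to the instance the way the old `gpt3.MAX_LENGTH = 5` line did. A minimal, self-contained sketch of that pattern follows; the `Model` class is illustrative, not the lm_eval one.

import unittest.mock as mock


class Model:
    """Illustrative stand-in for a model class exposing a read-only property."""

    @property
    def max_length(self):
        return 1024


def test_patched_property():
    # Patch the property on the *class*; anything reading it inside the
    # context manager sees the mocked value instead of the real one.
    with mock.patch.object(Model, "max_length", new_callable=mock.PropertyMock) as mock_max_length:
        mock_max_length.return_value = 5
        assert Model().max_length == 5
    # Outside the context manager the original property is restored.
    assert Model().max_length == 1024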
import pytest
+import unittest.mock as mock
import lm_eval.models as models
@@ -38,22 +39,31 @@ def test_gpt2():
    assert gen == ', lazy fox and they both fall to the ground'

-    targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
+    targets = [
+        -61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
+        -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
+    ]
    for (pred, _), tgt in zip(vals, targets):
        assert pred == pytest.approx(tgt, rel=1e-3)


def test_gpt2_perplexity():
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
    perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487])
+    tgt = sum([
+        -4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
+        -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
+    ])
    assert perplexity == pytest.approx(tgt, rel=1e-3)
-    # Hack: modify gpt2 to have shorter context length to induce rolling windows
-    gpt2.max_length = 5
+    with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+        mock_max_length.return_value = 5
+        gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
        perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813])
+        tgt = sum([
+            -4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
+            -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
+        ])
        assert perplexity == pytest.approx(tgt, rel=1e-3)
@@ -4,13 +4,13 @@ import pytest
from itertools import islice

-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_basic_interface(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_basic_interface(taskname, task_class):
    print('Evaluating task', taskname)
-    #dl = Task.download
-    #Task.download = MagicMock()
-    task = Task()
-    #Task.download = dl
+    # dl = task_class.download
+    # task_class.download = MagicMock()
+    task = task_class()
+    # task_class.download = dl
    assert task.has_training_docs() in [True, False]
    assert task.has_validation_docs() in [True, False]
@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
    assert isinstance(task.higher_is_better(), dict)
    assert task.aggregation().keys() == task.higher_is_better().keys()

-    for v in task.higher_is_better().values(): assert v in [True, False]
+    for v in task.higher_is_better().values():
+        assert v in [True, False]

    assert isinstance(task.VERSION, int)

    # test deterministic docs
    # (don't test train because it's slow)

-    task2 = Task()
+    task2 = task_class()
    limit = None
-    if taskname in ["triviaqa"]: limit = 10000
+    if taskname in ["triviaqa"] or taskname.startswith("pile_"):
+        limit = 10000
    if task.has_validation_docs():
        arr = list(islice(task.validation_docs(), limit))
        arr2 = list(islice(task2.validation_docs(), limit))
@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
    assert reqs == reqs2

-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_documents_and_requests(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_documents_and_requests(taskname, task_class):
    print('Evaluating task', taskname)
-    task = Task()
+    task = task_class()

    fns = []
-    if task.has_training_docs(): fns.append(task.training_docs)
-    if task.has_validation_docs(): fns.append(task.validation_docs)
+    if task.has_training_docs():
+        fns.append(task.training_docs)
+    if task.has_validation_docs():
+        fns.append(task.validation_docs)
    # test doc might not have labels
-    #if task.has_test_docs(): fns.append(task.test_docs)
+    # if task.has_test_docs(): fns.append(task.test_docs)

    for fn in fns:
-        #print(list(islice(fn(), 10)))
+        # print(list(islice(fn(), 10)))
        for doc in islice(fn(), 10):
            txt = task.doc_to_text(doc)
@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
            reqs = task.construct_requests(doc, txt)
            # construct_requests can return just one request
-            if not isinstance(reqs, (list, tuple)): reqs = [reqs]
+            if not isinstance(reqs, (list, tuple)):
+                reqs = [reqs]
            # todo: mock lm after refactoring evaluator.py to not be a mess
            for req in reqs:
...
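These task tests are all parametrized over `tasks.TASK_REGISTRY.items()`, so pytest generates one test case per registered task. A tiny illustration of how that expansion works with a stand-in registry; the names below are hypothetical, not from lm_eval.

import pytest

# Hypothetical stand-in for tasks.TASK_REGISTRY: maps task name -> task class.
class TaskA:
    pass

class TaskB:
    pass

TASK_REGISTRY = {"task_a": TaskA, "task_b": TaskB}


@pytest.mark.parametrize("taskname,task_class", TASK_REGISTRY.items())
def test_registry_entry(taskname, task_class):
    # pytest runs this body once per (name, class) pair in the registry.
    task = task_class()
    assert isinstance(taskname, str)
    assert task is not None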
@@ -25,6 +25,7 @@ def assert_target(name, ob):
        with open(fname, 'w') as fh:
            json.dump(ob, fh, sort_keys=True)


def assert_target_hashed(name, ob):
    fname = f"tests/testdata/{name}"
    if os.path.exists(fname):
@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):

# make sure eval results for a task version are stable
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_versions_stable(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_versions_stable(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    def ll_fn(reqs):
        for ctx, cont in reqs:
-            if len(ctx) == 0: continue
+            if len(ctx) == 0:
+                continue
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'

-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)

        res = []
        random.seed(42)
@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
        for string, in reqs:
            assert isinstance(string, str)

-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)

        res = []
        random.seed(42)
@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
    def greedy_until(reqs):
        res = []
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
        for ctx, _ in reqs:
            res.append("lol")
@@ -97,5 +99,13 @@ def test_versions_stable(taskname, Task):
    lm.greedy_until = greedy_until

    limit = None
-    res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    assert_target(f"{taskname}-v{Task.VERSION}-res", res)
+    result = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
+    assert_target(f"{taskname}-v{task_class.VERSION}-res", result)
6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf
\ No newline at end of file
{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
\ No newline at end of file
77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d
\ No newline at end of file
{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}}
\ No newline at end of file