Commit 55e62507 authored by researcher2

Merge branch 'master' into researcher2

parents bb0eafbb 26f0233f
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 setuptools.setup(
     name="lm_eval",
-    version="0.0.1",
+    version="0.1.0",
     author="Leo Gao",
     author_email="lg@eleuther.ai",
     description="A framework for evaluating autoregressive language models",
@@ -20,7 +20,7 @@ setuptools.setup(
     ],
     python_requires='>=3.6',
     install_requires=[
-        "black==20.8b1",
+        "black",
         "best_download>=0.0.6",
         "datasets==1.15.1",
         "click>=7.1",
import random

import lm_eval.tasks
import lm_eval.models


def test_description_dict():
    seed = 42
    num_examples = 1
    task_names = ["hellaswag", "winogrande"]
    description_dict = {
        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "winogrande": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
    }
    task_dict = lm_eval.tasks.get_task_dict(task_names)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(seed)
        # Fall back through the available document splits; the original draft
        # compared against an undefined `set` variable here, which never matched.
        if task.has_training_docs():
            docs = task.training_docs()
        elif task.has_validation_docs():
            docs = task.validation_docs()
        elif task.has_test_docs():
            docs = task.test_docs()
        else:
            continue
        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )
        for _, doc in (
            zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
        ):
            ctx = task.fewshot_context(
                doc=doc,
                num_fewshot=1,
                rnd=rnd,
                description=description,
            )
            assert description in ctx
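For orientation, the assertion above only relies on a weak property: the per-task description must appear verbatim somewhere in the few-shot context. The helper below is a hypothetical sketch of that behaviour, not the harness's actual fewshot_context implementation.

# Hypothetical illustration only -- not lm_eval's real fewshot_context logic.
def build_context(description, fewshot_examples, query):
    parts = []
    if description:
        parts.append(description)   # the description, if any, leads the prompt
    parts.extend(fewshot_examples)  # then the sampled few-shot examples
    parts.append(query)             # finally the document being evaluated
    return "\n\n".join(parts)


ctx = build_context("Label for the relevant action:", ["Example question ... answer ..."], "Query question ...")
assert "Label for the relevant action:" in ctx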
@@ -10,8 +10,8 @@ import pytest
 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_evaluator(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_evaluator(taskname, task_class):
     task_dict = tasks.get_task_dict([taskname])
     os.system("rm test_cache.db")
@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
     def ll_fn(reqs):
         for ctx, cont in reqs:
-            if len(ctx) == 0: continue
+            if len(ctx) == 0:
+                continue
             # space convention
             assert ctx[-1] != ' '
             assert cont[0] == ' ' or ctx[-1] == '\n'
@@ -47,8 +48,22 @@ def test_evaluator(taskname, Task):
     lm.loglikelihood_rolling = ll_perp_fn
     limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    e1 = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
+    e2 = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
-    # check taht caching is working
+    # check that caching is working
     assert e1 == e2
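Both calls above switch from positional to keyword arguments and thread through the new description_dict parameter. As a usage sketch based only on the keyword arguments visible in this diff (not on the full evaluator.evaluate signature), an external caller would look roughly like this:

import lm_eval.evaluator as evaluator
import lm_eval.models as models
import lm_eval.tasks as tasks

# Sketch using only the keyword arguments shown in the diff above; other
# parameters of evaluator.evaluate are not reproduced here.
lm = models.get_model("dummy")()
results = evaluator.evaluate(
    lm=lm,
    task_dict=tasks.get_task_dict(["hellaswag"]),
    num_fewshot=0,
    limit=10,
    bootstrap_iters=10,
    description_dict={"hellaswag": "Label for the relevant action:"},
)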
-import lm_eval.tasks as tasks
 import lm_eval.models as models
-import lm_eval.evaluator as evaluator
-import random
 import pytest
 import os
 import json
@@ -10,10 +7,11 @@ import mock
 import pickle
 import hashlib
-def completion(**kwargs):
+os.environ['OPENAI_API_SECRET_KEY'] = ""
+def mock_completion(**kwargs):
+    # Mock completion function
+    # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
+    os.makedirs("tests/testdata", exist_ok=True)
     hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
     fname = f"tests/testdata/gpt3_test_{hash}.pkl"
@@ -21,16 +19,15 @@ def completion(**kwargs):
     with open(fname, 'rb') as fh:
         return pickle.load(fh)
     ret = openai.Completion.create(**kwargs)
+    ret.api_key = ""
     with open(fname, 'wb') as fh:
         pickle.dump(ret, fh)
     return ret
-os.makedirs("tests/testdata", exist_ok=True)
-@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
+@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
 def test_gpt3():
-    if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
     gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
     (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
         ('The quick brown fox jumps over the lazy', ' dog'),
@@ -39,8 +36,8 @@ def test_gpt3():
         ('The quick brown fox jumps over the lazy', ', lazy fox'),
         ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
         ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
         ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
         ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""),
         ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""),
         ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""),
@@ -69,15 +66,18 @@ def test_gpt3():
     print([x[0] for x in vals])
-    targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
+    targets = [
+        -34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
+        -321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
+    ]
     for (pred, _), tgt in zip(vals, targets):
         assert pred == pytest.approx(tgt, rel=1e-3)
-@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
+@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
 def test_gpt3_perplexity():
-    if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
     gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
     test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
     perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
     assert perplexity == pytest.approx(tgt, rel=1e-3)
     # Hack: modify gpt3 to have shorter context length to induce rolling windows
-    gpt3.MAX_LENGTH = 5
-    perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
-    tgt = -101.93490880000002
+    with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+        mock_max_length.return_value = 5
+        gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
+        perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
+        tgt = -101.81967209999999
     assert perplexity == pytest.approx(tgt, rel=1e-3)
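The perplexity change above replaces direct attribute assignment with mock.patch.object(..., new_callable=mock.PropertyMock), the standard way to override a read-only property for the duration of a block. A self-contained illustration of that pattern, using a dummy class rather than the harness's GPT3LM:

import unittest.mock as mock


class FakeModel:
    @property
    def max_length(self):
        # pretend this is derived from the underlying model config
        return 2048


with mock.patch.object(FakeModel, "max_length", new_callable=mock.PropertyMock) as mock_max_length:
    mock_max_length.return_value = 5
    # the patch applies at the class level, so instances created inside the
    # block (like the re-created gpt3 object above) see the shortened value
    assert FakeModel().max_length == 5
assert FakeModel().max_length == 2048  # restored once the block exits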
 import pytest
+import unittest.mock as mock
 import lm_eval.models as models
@@ -38,22 +39,31 @@ def test_gpt2():
     assert gen == ', lazy fox and they both fall to the ground'
-    targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
+    targets = [
+        -61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
+        -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
+    ]
     for (pred, _), tgt in zip(vals, targets):
         assert pred == pytest.approx(tgt, rel=1e-3)
 def test_gpt2_perplexity():
     gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
     test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
     perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487])
+    tgt = sum([
+        -4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
+        -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
+    ])
     assert perplexity == pytest.approx(tgt, rel=1e-3)
-    # Hack: modify gpt2 to have shorter context length to induce rolling windows
-    gpt2.max_length = 5
-    perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813])
+    with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+        mock_max_length.return_value = 5
+        gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
+        perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
+        tgt = sum([
+            -4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
+            -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
+        ])
     assert perplexity == pytest.approx(tgt, rel=1e-3)
@@ -4,13 +4,13 @@ import pytest
 from itertools import islice
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_basic_interface(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_basic_interface(taskname, task_class):
     print('Evaluating task', taskname)
-    #dl = Task.download
-    #Task.download = MagicMock()
-    task = Task()
-    #Task.download = dl
+    # dl = task_class.download
+    # task_class.download = MagicMock()
+    task = task_class()
+    # task_class.download = dl
     assert task.has_training_docs() in [True, False]
     assert task.has_validation_docs() in [True, False]
@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
     assert isinstance(task.higher_is_better(), dict)
     assert task.aggregation().keys() == task.higher_is_better().keys()
-    for v in task.higher_is_better().values(): assert v in [True, False]
+    for v in task.higher_is_better().values():
+        assert v in [True, False]
     assert isinstance(task.VERSION, int)
     # test deterministic docs
     # (don't test train because it's slow)
-    task2 = Task()
+    task2 = task_class()
     limit = None
-    if taskname in ["triviaqa"]: limit = 10000
+    if taskname in ["triviaqa"] or taskname.startswith("pile_"):
+        limit = 10000
     if task.has_validation_docs():
         arr = list(islice(task.validation_docs(), limit))
         arr2 = list(islice(task2.validation_docs(), limit))
@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
     assert reqs == reqs2
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_documents_and_requests(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_documents_and_requests(taskname, task_class):
     print('Evaluating task', taskname)
-    task = Task()
+    task = task_class()
     fns = []
-    if task.has_training_docs(): fns.append(task.training_docs)
-    if task.has_validation_docs(): fns.append(task.validation_docs)
+    if task.has_training_docs():
+        fns.append(task.training_docs)
+    if task.has_validation_docs():
+        fns.append(task.validation_docs)
     # test doc might not have labels
-    #if task.has_test_docs(): fns.append(task.test_docs)
+    # if task.has_test_docs(): fns.append(task.test_docs)
     for fn in fns:
-        #print(list(islice(fn(), 10)))
+        # print(list(islice(fn(), 10)))
         for doc in islice(fn(), 10):
             txt = task.doc_to_text(doc)
@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
             reqs = task.construct_requests(doc, txt)
             # construct_requests can return just one request
-            if not isinstance(reqs, (list, tuple)): reqs = [reqs]
+            if not isinstance(reqs, (list, tuple)):
+                reqs = [reqs]
             # todo: mock lm after refactoring evaluator.py to not be a mess
             for req in reqs:
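These task tests are parametrized over the whole registry, so each (name, class) pair becomes its own test case; the rename from Task to task_class makes it explicit that the parameter is a class, not an instance. A small standalone example of the same pytest pattern, with a hypothetical two-entry registry:

import pytest


class DummyTaskA:
    VERSION = 0


class DummyTaskB:
    VERSION = 1


DUMMY_REGISTRY = {"dummy_a": DummyTaskA, "dummy_b": DummyTaskB}


@pytest.mark.parametrize("taskname,task_class", DUMMY_REGISTRY.items())
def test_registry_entry(taskname, task_class):
    task = task_class()  # one instantiation per registry entry
    assert isinstance(task.VERSION, int)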
@@ -25,6 +25,7 @@ def assert_target(name, ob):
         with open(fname, 'w') as fh:
             json.dump(ob, fh, sort_keys=True)
 def assert_target_hashed(name, ob):
     fname = f"tests/testdata/{name}"
     if os.path.exists(fname):
@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):
 # make sure eval results for a task version are stable
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_versions_stable(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_versions_stable(taskname, task_class):
     task_dict = tasks.get_task_dict([taskname])
     lm = models.get_model('dummy')()
     def ll_fn(reqs):
         for ctx, cont in reqs:
-            if len(ctx) == 0: continue
+            if len(ctx) == 0:
+                continue
             # space convention
             assert ctx[-1] != ' '
             assert cont[0] == ' ' or ctx[-1] == '\n'
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
         res = []
         random.seed(42)
@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
         for string, in reqs:
             assert isinstance(string, str)
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)
         res = []
         random.seed(42)
@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
     def greedy_until(reqs):
         res = []
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
         for ctx, _ in reqs:
             res.append("lol")
@@ -97,5 +99,13 @@ def test_versions_stable(taskname, Task):
     lm.greedy_until = greedy_until
     limit = None
-    res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    assert_target(f"{taskname}-v{Task.VERSION}-res", res)
+    result = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
+    assert_target(f"{taskname}-v{task_class.VERSION}-res", result)
6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf
\ No newline at end of file
{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
\ No newline at end of file
77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d
\ No newline at end of file
{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}}
\ No newline at end of file
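The new testdata files above pair a plain-text SHA-256 digest (likely pinning the exact requests a task version generates) with a JSON blob of expected results. A hedged sketch of the pinning idea behind assert_target_hashed follows; the real helper's serialization of the object is not visible in this diff, so the json.dumps step below is an assumption.

import hashlib
import json
import os


def assert_target_hashed_sketch(fname, ob):
    # Assumption: the object is serialized deterministically before hashing.
    digest = hashlib.sha256(json.dumps(ob, sort_keys=True, default=str).encode("utf-8")).hexdigest()
    if os.path.exists(fname):
        # compare against the previously recorded digest
        with open(fname) as fh:
            assert fh.read().strip() == digest
    else:
        # first run: record the digest as the new target
        with open(fname, "w") as fh:
            fh.write(digest)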