Commit 55e62507 authored by researcher2

Merge branch 'master' into researcher2

parents bb0eafbb 26f0233f
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 setuptools.setup(
     name="lm_eval",
-    version="0.0.1",
+    version="0.1.0",
     author="Leo Gao",
     author_email="lg@eleuther.ai",
     description="A framework for evaluating autoregressive language models",
@@ -20,7 +20,7 @@ setuptools.setup(
     ],
     python_requires='>=3.6',
     install_requires=[
-        "black==20.8b1",
+        "black",
         "best_download>=0.0.6",
         "datasets==1.15.1",
         "click>=7.1",
import random

import lm_eval.tasks
import lm_eval.models


def test_description_dict():
    seed = 42
    num_examples = 1
    task_names = ["hellaswag", "winogrande"]
    description_dict = {
        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "winogrande": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
    }
    task_dict = lm_eval.tasks.get_task_dict(task_names)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(seed)
        # Fall back through the available document splits; the original draft
        # compared against an undefined `set` variable here, which never matched.
        if task.has_training_docs():
            docs = task.training_docs()
        elif task.has_validation_docs():
            docs = task.validation_docs()
        elif task.has_test_docs():
            docs = task.test_docs()
        else:
            continue
        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )
        for _, doc in (
            zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
        ):
            ctx = task.fewshot_context(
                doc=doc,
                num_fewshot=1,
                rnd=rnd,
                description=description,
            )
            assert description in ctx
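For orientation, the assertion above only relies on a weak property: the per-task description must appear verbatim somewhere in the few-shot context. The helper below is a hypothetical sketch of that behaviour, not the harness's actual fewshot_context implementation.

# Hypothetical illustration only -- not lm_eval's real fewshot_context logic.
def build_context(description, fewshot_examples, query):
    parts = []
    if description:
        parts.append(description)   # the description, if any, leads the prompt
    parts.extend(fewshot_examples)  # then the sampled few-shot examples
    parts.append(query)             # finally the document being evaluated
    return "\n\n".join(parts)


ctx = build_context("Label for the relevant action:", ["Example question ... answer ..."], "Query question ...")
assert "Label for the relevant action:" in ctx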
@@ -10,8 +10,8 @@ import pytest
 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_evaluator(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_evaluator(taskname, task_class):
     task_dict = tasks.get_task_dict([taskname])
     os.system("rm test_cache.db")
@@ -19,7 +19,8 @@ def test_evaluator(taskname, Task):
     def ll_fn(reqs):
         for ctx, cont in reqs:
-            if len(ctx) == 0: continue
+            if len(ctx) == 0:
+                continue
             # space convention
             assert ctx[-1] != ' '
             assert cont[0] == ' ' or ctx[-1] == '\n'
@@ -47,8 +48,22 @@ def test_evaluator(taskname, Task):
     lm.loglikelihood_rolling = ll_perp_fn
     limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    e1 = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
+    e2 = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
-    # check taht caching is working
+    # check that caching is working
     assert e1 == e2
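Both calls above switch from positional to keyword arguments and thread through the new description_dict parameter. As a usage sketch based only on the keyword arguments visible in this diff (not on the full evaluator.evaluate signature), an external caller would look roughly like this:

import lm_eval.evaluator as evaluator
import lm_eval.models as models
import lm_eval.tasks as tasks

# Sketch using only the keyword arguments shown in the diff above; other
# parameters of evaluator.evaluate are not reproduced here.
lm = models.get_model("dummy")()
results = evaluator.evaluate(
    lm=lm,
    task_dict=tasks.get_task_dict(["hellaswag"]),
    num_fewshot=0,
    limit=10,
    bootstrap_iters=10,
    description_dict={"hellaswag": "Label for the relevant action:"},
)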
-import lm_eval.tasks as tasks
 import lm_eval.models as models
-import lm_eval.evaluator as evaluator
-import random
 import pytest
 import os
 import json
@@ -10,10 +7,11 @@ import mock
 import pickle
 import hashlib
-def completion(**kwargs):
+os.environ['OPENAI_API_SECRET_KEY'] = ""
+def mock_completion(**kwargs):
+    # Mock completion function
+    # Loads from a cached+pickled response if it exists, otherwise it will actually try to ping
+    os.makedirs("tests/testdata", exist_ok=True)
     hash = hashlib.sha256(json.dumps(kwargs, sort_keys=True).encode('utf-8')).hexdigest()
     fname = f"tests/testdata/gpt3_test_{hash}.pkl"
@@ -21,16 +19,15 @@ def completion(**kwargs):
     with open(fname, 'rb') as fh:
         return pickle.load(fh)
     ret = openai.Completion.create(**kwargs)
+    ret.api_key = ""
     with open(fname, 'wb') as fh:
         pickle.dump(ret, fh)
     return ret
-os.makedirs("tests/testdata", exist_ok=True)
-@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
+@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
 def test_gpt3():
-    if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
     gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
     (ll_dog, ig_dog), (ll_cat, ig_cat), (_, ll_max_0), (_, ll_max_1), (_, ll_max_2), *vals = gpt3.loglikelihood([
         ('The quick brown fox jumps over the lazy', ' dog'),
@@ -39,8 +36,8 @@ def test_gpt3():
         ('The quick brown fox jumps over the lazy', ', lazy fox'),
         ('The quick brown fox jumps over the lazy', ', lazy fox and they both fall to the ground'),
         ("""A mult""", """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)"""),
         ("""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""", """ (with threshold activation); see § Terminology"""),
         ("""Multilayer perceptrons are sometimes coll""", """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]"""),
         ("""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""", """ activation function."""),
         ("""MLP utilizes a supervised""", """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]"""),
@@ -69,15 +66,18 @@ def test_gpt3():
     print([x[0] for x in vals])
-    targets = [-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994, -658.0542459504098, -34.85833048, -7.5162964]
+    targets = [
+        -34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
+        -321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
+    ]
     for (pred, _), tgt in zip(vals, targets):
         assert pred == pytest.approx(tgt, rel=1e-3)
-@mock.patch("lm_eval.models.gpt3.oa_completion", new=completion)
+@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
 def test_gpt3_perplexity():
-    if "OPENAI_API_SECRET_KEY" not in os.environ: os.environ["OPENAI_API_SECRET_KEY"] = ""
     gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
     test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
     perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
@@ -85,7 +85,9 @@ def test_gpt3_perplexity():
     assert perplexity == pytest.approx(tgt, rel=1e-3)
     # Hack: modify gpt3 to have shorter context length to induce rolling windows
-    gpt3.MAX_LENGTH = 5
-    perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
-    tgt = -101.93490880000002
+    with mock.patch.object(models.gpt3.GPT3LM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+        mock_max_length.return_value = 5
+        gpt3 = models.get_model('gpt3').create_from_arg_string("engine=ada")
+        perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
+        tgt = -101.81967209999999
     assert perplexity == pytest.approx(tgt, rel=1e-3)
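The perplexity change above replaces direct attribute assignment with mock.patch.object(..., new_callable=mock.PropertyMock), the standard way to override a read-only property for the duration of a block. A self-contained illustration of that pattern, using a dummy class rather than the harness's GPT3LM:

import unittest.mock as mock


class FakeModel:
    @property
    def max_length(self):
        # pretend this is derived from the underlying model config
        return 2048


with mock.patch.object(FakeModel, "max_length", new_callable=mock.PropertyMock) as mock_max_length:
    mock_max_length.return_value = 5
    # the patch applies at the class level, so instances created inside the
    # block (like the re-created gpt3 object above) see the shortened value
    assert FakeModel().max_length == 5
assert FakeModel().max_length == 2048  # restored once the block exits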
 import pytest
+import unittest.mock as mock
 import lm_eval.models as models
@@ -38,22 +39,31 @@ def test_gpt2():
     assert gen == ', lazy fox and they both fall to the ground'
-    targets = [-61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188, -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281]
+    targets = [
+        -61.60536193847656, -56.57843780517578, -62.131004333496094, -9.799489974975586, -153.96334838867188,
+        -341.222900390625, -731.1475830078125, -61.60536193847656, -8.682319641113281
+    ]
     for (pred, _), tgt in zip(vals, targets):
         assert pred == pytest.approx(tgt, rel=1e-3)
 def test_gpt2_perplexity():
     gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
     test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
     perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([-4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072, -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487])
+    tgt = sum([
+        -4.9599953, -8.069298, -8.308624, -10.178513, -8.906924, -1.9318912, -7.745445, -7.146077, -5.2072,
+        -3.5882986, -1.9957212, -8.044922, -0.20841774, -5.1096807, -0.099879116, -8.888423, -4.6180487,
+    ])
     assert perplexity == pytest.approx(tgt, rel=1e-3)
-    # Hack: modify gpt2 to have shorter context length to induce rolling windows
-    gpt2.max_length = 5
-    perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
-    tgt = sum([-4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891, -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813])
+    with mock.patch.object(models.gpt2.HFLM, 'max_length', new_callable=mock.PropertyMock) as mock_max_length:
+        mock_max_length.return_value = 5
+        gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
+        perplexity = gpt2.loglikelihood_rolling([(test_string,)])[0]
+        tgt = sum([
+            -4.96001, -8.069275, -8.308612, -10.178482, -8.90691, -4.037338, -8.09261, -11.662385, -10.206891,
+            -4.425003, -2.2563353, -7.909143, -1.9304147, -7.3610134, -2.3120654, -7.3229, -2.1643813,
+        ])
     assert perplexity == pytest.approx(tgt, rel=1e-3)
@@ -4,13 +4,13 @@ import pytest
 from itertools import islice
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_basic_interface(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_basic_interface(taskname, task_class):
     print('Evaluating task', taskname)
-    #dl = Task.download
-    #Task.download = MagicMock()
-    task = Task()
-    #Task.download = dl
+    # dl = task_class.download
+    # task_class.download = MagicMock()
+    task = task_class()
+    # task_class.download = dl
     assert task.has_training_docs() in [True, False]
     assert task.has_validation_docs() in [True, False]
@@ -20,18 +20,20 @@ def test_basic_interface(taskname, Task):
     assert isinstance(task.higher_is_better(), dict)
     assert task.aggregation().keys() == task.higher_is_better().keys()
-    for v in task.higher_is_better().values(): assert v in [True, False]
+    for v in task.higher_is_better().values():
+        assert v in [True, False]
     assert isinstance(task.VERSION, int)
     # test deterministic docs
     # (don't test train because it's slow)
-    task2 = Task()
+    task2 = task_class()
     limit = None
-    if taskname in ["triviaqa"]: limit = 10000
+    if taskname in ["triviaqa"] or taskname.startswith("pile_"):
+        limit = 10000
     if task.has_validation_docs():
         arr = list(islice(task.validation_docs(), limit))
         arr2 = list(islice(task2.validation_docs(), limit))
@@ -66,18 +68,20 @@ def test_basic_interface(taskname, Task):
     assert reqs == reqs2
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_documents_and_requests(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_documents_and_requests(taskname, task_class):
     print('Evaluating task', taskname)
-    task = Task()
+    task = task_class()
     fns = []
-    if task.has_training_docs(): fns.append(task.training_docs)
-    if task.has_validation_docs(): fns.append(task.validation_docs)
+    if task.has_training_docs():
+        fns.append(task.training_docs)
+    if task.has_validation_docs():
+        fns.append(task.validation_docs)
     # test doc might not have labels
-    #if task.has_test_docs(): fns.append(task.test_docs)
+    # if task.has_test_docs(): fns.append(task.test_docs)
     for fn in fns:
-        #print(list(islice(fn(), 10)))
+        # print(list(islice(fn(), 10)))
         for doc in islice(fn(), 10):
             txt = task.doc_to_text(doc)
@@ -95,7 +99,8 @@ def test_documents_and_requests(taskname, Task):
             reqs = task.construct_requests(doc, txt)
             # construct_requests can return just one request
-            if not isinstance(reqs, (list, tuple)): reqs = [reqs]
+            if not isinstance(reqs, (list, tuple)):
+                reqs = [reqs]
             # todo: mock lm after refactoring evaluator.py to not be a mess
             for req in reqs:
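These task tests are parametrized over the whole registry, so each (name, class) pair becomes its own test case; the rename from Task to task_class makes it explicit that the parameter is a class, not an instance. A small standalone example of the same pytest pattern, with a hypothetical two-entry registry:

import pytest


class DummyTaskA:
    VERSION = 0


class DummyTaskB:
    VERSION = 1


DUMMY_REGISTRY = {"dummy_a": DummyTaskA, "dummy_b": DummyTaskB}


@pytest.mark.parametrize("taskname,task_class", DUMMY_REGISTRY.items())
def test_registry_entry(taskname, task_class):
    task = task_class()  # one instantiation per registry entry
    assert isinstance(task.VERSION, int)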
@@ -25,6 +25,7 @@ def assert_target(name, ob):
         with open(fname, 'w') as fh:
             json.dump(ob, fh, sort_keys=True)
 def assert_target_hashed(name, ob):
     fname = f"tests/testdata/{name}"
     if os.path.exists(fname):
@@ -48,19 +49,20 @@ def flatten(d, parent_key='', sep='.'):
 # make sure eval results for a task version are stable
-@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
-def test_versions_stable(taskname, Task):
+@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
+def test_versions_stable(taskname, task_class):
     task_dict = tasks.get_task_dict([taskname])
     lm = models.get_model('dummy')()
     def ll_fn(reqs):
         for ctx, cont in reqs:
-            if len(ctx) == 0: continue
+            if len(ctx) == 0:
+                continue
             # space convention
             assert ctx[-1] != ' '
             assert cont[0] == ' ' or ctx[-1] == '\n'
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood", reqs)
         res = []
         random.seed(42)
@@ -73,7 +75,7 @@ def test_versions_stable(taskname, Task):
         for string, in reqs:
             assert isinstance(string, str)
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-loglikelihood_rolling", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-loglikelihood_rolling", reqs)
         res = []
         random.seed(42)
@@ -84,7 +86,7 @@ def test_versions_stable(taskname, Task):
     def greedy_until(reqs):
         res = []
-        assert_target_hashed(f"{taskname}-v{Task.VERSION}-greedy_until", reqs)
+        assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
         for ctx, _ in reqs:
             res.append("lol")
@@ -97,5 +99,13 @@ def test_versions_stable(taskname, Task):
     lm.greedy_until = greedy_until
     limit = None
-    res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    assert_target(f"{taskname}-v{Task.VERSION}-res", res)
+    result = evaluator.evaluate(
+        lm=lm,
+        task_dict=task_dict,
+        num_fewshot=0,
+        limit=limit,
+        bootstrap_iters=10,
+        description_dict=None
+    )
+    assert_target(f"{taskname}-v{task_class.VERSION}-res", result)
6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf
\ No newline at end of file
{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
\ No newline at end of file
77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d
\ No newline at end of file
{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}}
\ No newline at end of file
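The new testdata files above pair a plain-text SHA-256 digest (likely pinning the exact requests a task version generates) with a JSON blob of expected results. A hedged sketch of the pinning idea behind assert_target_hashed follows; the real helper's serialization of the object is not visible in this diff, so the json.dumps step below is an assumption.

import hashlib
import json
import os


def assert_target_hashed_sketch(fname, ob):
    # Assumption: the object is serialized deterministically before hashing.
    digest = hashlib.sha256(json.dumps(ob, sort_keys=True, default=str).encode("utf-8")).hexdigest()
    if os.path.exists(fname):
        # compare against the previously recorded digest
        with open(fname) as fh:
            assert fh.read().strip() == digest
    else:
        # first run: record the digest as the new target
        with open(fname, "w") as fh:
            fh.write(digest)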