Commit 53f6bc34 authored by Jason Phang's avatar Jason Phang
Browse files

update GPT3 test data and more docs

parent 76ebb792
......@@ -11,7 +11,7 @@ import numpy as np
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
......@@ -24,7 +24,7 @@ def simple_evaluate(model, model_args, task_names,
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache model responses (presumably disables the result cache when True — confirm against lm_eval caching implementation)
:param limit: int, optional
......@@ -32,6 +32,7 @@ def simple_evaluate(model, model_args, task_names,
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return:
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
......@@ -64,6 +65,23 @@ def simple_evaluate(model, model_args, task_names,
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
NOT IMPLEMENTED
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return:
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
# TODO: implement proper description-providing system
......
......@@ -8,6 +8,18 @@ import time
def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
......
......@@ -19,6 +19,7 @@ def mock_completion(**kwargs):
with open(fname, 'rb') as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, 'wb') as fh:
pickle.dump(ret, fh)
return ret
......@@ -65,8 +66,8 @@ def test_gpt3():
print([x[0] for x in vals])
targets = [
-34.85833048, -47.114367866, -45.43520782100001, -5.289627985, -133.96879783896998, -321.30299892039994,
-658.0542459504098, -34.85833048, -7.5162964
-34.848301606999996, -47.148329679999996, -45.44380149599999, -5.285246016, -133.97821690686004,
-321.2616693239001, -658.0299524401041, -34.848301606999996, -7.525115,
]
for (pred, _), tgt in zip(vals, targets):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment