Commit 4225df50 authored by baberabb

test pythia-70m and pytest only test_evaluator.py

parent d9b547b7
@@ -44,7 +44,7 @@ jobs:
           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
       - name: Test with pytest
         run: |
-          pytest -vv --cov=lm_eval/ tests/
-      - name: Upload to codecov
-        run: |
-          bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
+          pytest -vv tests/evaluator.py
+      # - name: Upload to codecov
+      #   run: |
+      #     bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
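The CI step now runs a single test module instead of the whole tests/ tree, and the coverage flag and codecov upload are dropped. For local debugging, the same narrowed selection can be reproduced through pytest's programmatic entry point. A minimal sketch, using the module path exactly as it appears in the diff (the commit message calls the file test_evaluator.py, so the path may need adjusting):

# Run the same narrowed pytest selection that CI now uses.
# The module path mirrors the diff above; adjust it if the file is
# actually tests/test_evaluator.py as the commit message suggests.
import sys

import pytest

if __name__ == "__main__":
    # pytest.main accepts CLI-style arguments and returns an exit code.
    sys.exit(pytest.main(["-vv", "tests/evaluator.py"]))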
@@ -14,58 +14,70 @@ import pytest
 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces
 # @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
-def test_evaluator():
-    TASK = ["arc_easy"]
-    LIMIT = 10
+@pytest.mark.parametrize(
+    "task_name,limit,model,model_args",
+    [
+        (
+            ["arc_easy"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+        )
+    ],
+)
+def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
     # task_dict = tasks.get_task_dict(task)
     # TODO: re-add cachingLM
     # os.system("rm test_cache.db")
     # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
-    lm = registry.get_model("dummy")()
-
-    def ll_fn(reqs):
-        for ctx, cont in [req.args for req in reqs]:
-            if len(ctx) == 0:
-                continue
-            # space convention
-            assert ctx[-1] != " "
-            assert cont[0] == " " or ctx[-1] == "\n"
-
-        res = []
-
-        random.seed(42)
-        for _ in reqs:
-            res.append((-random.random(), False))
-
-        return res
-
-    def ll_perp_fn(reqs):
-        for (string,) in reqs:
-            assert isinstance(string, str)
-
-        res = []
-        random.seed(42)
-        for _ in reqs:
-            res.append(-random.random())
-
-        return res
-
-    lm.loglikelihood = ll_fn
-    lm.loglikelihood_rolling = ll_perp_fn
-
+    # lm = registry.get_model("dummy")()
+    # def ll_fn(reqs):
+    #     for ctx, cont in [req.args for req in reqs]:
+    #         if len(ctx) == 0:
+    #             continue
+    #         # space convention
+    #         assert ctx[-1] != " "
+    #         assert cont[0] == " " or ctx[-1] == "\n"
+    #
+    #     res = []
+    #
+    #     random.seed(42)
+    #     for _ in reqs:
+    #         res.append((-random.random(), False))
+    #
+    #     return res
+    #
+    # def ll_perp_fn(reqs):
+    #     for (string,) in reqs:
+    #         assert isinstance(string, str)
+    #
+    #     res = []
+    #     random.seed(42)
+    #     for _ in reqs:
+    #         res.append(-random.random())
+    #
+    #     return res
+    #
+    # lm.loglikelihood = ll_fn
+    # lm.loglikelihood_rolling = ll_perp_fn
     e1 = evaluator.simple_evaluate(
-        model="dummy",
-        tasks=TASK,
-        limit=LIMIT,
-        bootstrap_iters=10,
+        model=model,
+        tasks=task_name,
+        limit=limit,
+        model_args=model_args,
     )
     e2 = evaluator.simple_evaluate(
-        model="dummy",
-        tasks=TASK,
-        limit=LIMIT,
-        bootstrap_iters=10,
+        model=model,
+        tasks=task_name,
+        limit=limit,
+        model_args=model_args,
     )
 
     # check that caching is working
......
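The diff is cut off right after the "# check that caching is working" comment, so the assertions comparing the two runs are not shown. A minimal sketch of what such a check could look like, assuming simple_evaluate returns a dict whose "results" entry maps task names to metric dicts, as elsewhere in lm-evaluation-harness; assert_results_match is a hypothetical helper, not part of the commit:

# Sketch of the truncated comparison step: run the same evaluation twice
# and check that the per-task metrics agree. Assumes e1 and e2 are dicts
# with a "results" entry of the form {task_name: {metric_name: value}}.
import pytest


def assert_results_match(e1: dict, e2: dict) -> None:
    assert e1["results"].keys() == e2["results"].keys()
    for task, metrics in e1["results"].items():
        for metric, value in metrics.items():
            # Use approximate comparison for floats, exact otherwise.
            expected = pytest.approx(value) if isinstance(value, float) else value
            assert e2["results"][task][metric] == expected

Further model configurations, such as the pythia-70m checkpoint named in the commit message, could be exercised by appending more tuples to the @pytest.mark.parametrize list above in the same argument order.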