"examples/offline_inference/structured_outputs.py" did not exist on "7b55581a62bc88a66338fc44d1fcf5ade218ed43"
Commit b7cd829b authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into benchmark-scripts
parents 2d96a8c8 4e44f0aa
......@@ -132,8 +132,8 @@ def main():
if args.output_path:
path = Path(args.output_path)
- # check if file or 'dir/results.jsonl' exists
- if path.is_file() or Path(args.output_path).joinpath("results.jsonl").is_file():
+ # check if file or 'dir/results.json' exists
+ if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
eval_logger.warning(
f"File already exists at {path}. Results will be overwritten."
)
......
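The hunk above switches the overwrite check from results.jsonl to results.json: --output_path may point either at a results file directly or at a directory expected to contain results.json. A minimal sketch of that check, using a hypothetical helper name and example paths not taken from the diff:

# Sketch only; `results_already_exist` and the paths below are illustrative.
from pathlib import Path

def results_already_exist(output_path: str) -> bool:
    path = Path(output_path)
    # either a results file was passed directly, or a directory that already
    # holds a results.json from a previous run
    return path.is_file() or path.joinpath("results.json").is_file()

print(results_already_exist("my_runs"))               # True if my_runs/results.json exists
print(results_already_exist("my_runs/results.json"))  # True if that file exists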
......@@ -18,6 +18,9 @@ setuptools.setup(
"lm_eval": ["**/*.yaml"],
"examples": ["**/*.yaml"],
},
+ entry_points={
+     "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
+ },
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
......@@ -50,6 +53,13 @@ setuptools.setup(
],
extras_require={
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"linting": [
"flake8",
"pylint",
"mypy",
"pre-commit",
],
"testing": ["pytest", "pytest-cov", "pytest-xdist"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
"promptsource": [
......
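The setup.py additions do two things: they register lm-eval and lm_eval console scripts, both resolving to main:main, and they add linting and testing extras, installable with the standard pip extras syntax such as pip install -e ".[linting,testing]". A minimal sketch of what the entry point resolves to, assuming the package is installed:

# Sketch only: running `lm-eval` (or `lm_eval`) from a shell is equivalent to
# importing the module and attribute named in "lm-eval = main:main".
from main import main  # top-level main.py of the repository

if __name__ == "__main__":
    main()  # parses its own CLI arguments, including the output path handled in the hunk above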
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os
# GitHub CI
def new_tasks() -> List[str]:
FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
if os.path.exists(FILENAME):
# If the tasks folder has changed, read the list of changed files from FILENAME
# and parse the YAML files to get the task names.
return parser(load_changed_files(FILENAME))
elif os.getenv("API") is not None:
# Otherwise, if the API has changed, the CI workflow sets the API env variable
# and we run this fixed set of tasks.
return ["arc_easy", "hellaswag", "piqa", "wikitext"]
# If neither condition holds, just run arc_easy.
else:
return ["arc_easy"]
def get_task_class() -> List[ConfigurableTask]:
task_name = new_tasks()
x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
return x
@pytest.fixture()
def limit() -> int:
return 10
# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
def test_download(self, task_class: ConfigurableTask):
task_class().download()
assert task_class().dataset is not None
def test_has_training_docs(self, task_class: ConfigurableTask):
assert task_class().has_training_docs() in [True, False]
def test_check_training_docs(self, task_class: ConfigurableTask):
task = task_class()
if task.has_training_docs():
assert task._config["training_split"] is not None
def test_has_validation_docs(self, task_class):
assert task_class().has_validation_docs() in [True, False]
def test_check_validation_docs(self, task_class):
task = task_class()
if task.has_validation_docs():
assert task._config["validation_split"] is not None
def test_has_test_docs(self, task_class):
assert task_class().has_test_docs() in [True, False]
def test_check_test_docs(self, task_class):
task = task_class()
if task.has_test_docs():
assert task._config["test_split"] is not None
def test_should_decontaminate(self, task_class):
task = task_class()
assert task.should_decontaminate() in [True, False]
if task.should_decontaminate():
assert task._config["doc_to_decontamination_query"] is not None
def test_doc_to_text(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array = [task.doc_to_text(doc) for doc in arr]
# space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
assert all(
isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
for x in _array
)
def test_create_choices(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
if "multiple_choice" in task._config.group:
_array = [task.doc_to_choice(doc) for doc in arr]
# assert all(len(x) == 4 for x in _array)
assert all(isinstance(x, list) for x in _array)
assert all(isinstance(x[0], str) for x in _array)
def test_doc_to_target(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array_target = [task.doc_to_target(doc) for doc in arr]
assert all(isinstance(label, int) for label in _array_target)
assert len(_array_target) == limit if limit else True
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
def test_build_all_requests(self, task_class, limit):
task_class().build_all_requests(rank=1, limit=limit, world_size=1)
assert task_class.instances is not None
def test_construct_requests(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
assert all(isinstance(doc, list) for doc in requests)
assert len(requests) == limit if limit else True
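The new test module above gates which tasks get exercised on CI: the changed-files output drives the selection, the API env variable falls back to a small fixed set, and arc_easy is the default. A standalone sketch of what a single parametrized TestNewTasks case does, using the same registry lookup as get_task_class() (assumes the harness and the task's dataset dependencies are installed):

# Sketch of one parametrized case, outside pytest.
import lm_eval.tasks as tasks

task_cls = tasks.TASK_REGISTRY["arc_easy"]  # same lookup get_task_class() performs
task = task_cls()

task.download()                                     # test_download
assert task.dataset is not None
assert task.has_training_docs() in [True, False]    # test_has_training_docs
if task.has_training_docs():                        # test_check_training_docs
    assert task._config["training_split"] is not None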
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
import sys
# This is the path where the output for the changed files for the tasks folder is stored
# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
# reads a text file and returns a list of words
# used to read the output of the changed txt from tj-actions/changed-files
def load_changed_files(file_path: str) -> List[str]:
with open(file_path, "r") as f:
content = f.read()
words_list = [x for x in content.split()]
sys.stdout.write(f"list of files: {words_list}")
return words_list
# checks the txt file for list of changed files.
# if file ends with .yaml then check yaml for task name
# if file ends with .py then parse the folder for all yaml files
def parser(full_path: List[str]) -> List[str]:
_output = set()
for x in full_path:
if x.endswith(".yaml"):
_output.add(load_yaml_config(x)["task"])
elif x.endswith(".py"):
path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
_output |= {load_yaml_config(x)["task"] for x in path}
return list(_output)
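This utilities module is the glue between the tj-actions/changed-files output and the task tests: load_changed_files reads the space-separated file list, and parser resolves each changed path to task names (directly for a YAML, via every sibling YAML for a .py file). A hedged usage sketch with hypothetical file contents:

# Illustration only: suppose the changed-files output contained
#   lm_eval/tasks/arc/arc_easy.yaml lm_eval/tasks/hellaswag/utils.py
# parser() would then yield the task of the single YAML plus the tasks of
# every YAML sitting next to the changed .py file.
if __name__ == "__main__":
    changed = load_changed_files(
        ".github/outputs/tasks_all_changed_and_modified_files.txt"
    )
    print(parser(changed))  # e.g. ['arc_easy', 'hellaswag', ...]; order is unstable (built from a set)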
......@@ -5,7 +5,7 @@ import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import random
import pytest
......@@ -15,60 +15,52 @@ import pytest
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
# TODO: re-add cachingLM
# os.system("rm test_cache.db")
# lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
lm = registry.get_model("dummy")()
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != " "
assert cont[0] == " " or ctx[-1] == "\n"
res = []
random.seed(42)
for _ in reqs:
res.append((-random.random(), False))
return res
def ll_perp_fn(reqs):
for (string,) in reqs:
assert isinstance(string, str)
res = []
random.seed(42)
for _ in reqs:
res.append(-random.random())
return res
lm.loglikelihood = ll_fn
lm.loglikelihood_rolling = ll_perp_fn
@pytest.mark.parametrize(
"task_name,limit,model,model_args",
[
(
["arc_easy"],
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
)
],
)
def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
task_name = task_name
limit = 10
e1 = evaluator.evaluate(
lm=lm,
task_dict=task_dict,
num_fewshot=0,
e1 = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
bootstrap_iters=10,
model_args=model_args,
)
assert e1 is not None
lm = api.registry.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": None,
"max_batch_size": None,
"device": None,
},
)
task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
e2 = evaluator.evaluate(
lm=lm,
task_dict=task_dict,
num_fewshot=0,
limit=limit,
bootstrap_iters=10,
)
assert e2 is not None
# check that caching is working
assert e1 == e2
def r(x):
return x["results"]["arc_easy"]
assert all(
x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
)
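The rewritten evaluator test replaces the dummy-LM unit test with an end-to-end check: simple_evaluate on a small HF model must agree with building the model through the registry and calling evaluate on the task dict directly. A standalone sketch of that equivalence, following the keyword arguments shown in the diff (it downloads EleutherAI/pythia-160m, so it is not a quick unit test, and the per-metric comparison of the actual test is simplified to a dict comparison here):

# Standalone version of the comparison the updated test performs.
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks

MODEL_ARGS = "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu"

# high-level entry point
e1 = evaluator.simple_evaluate(
    model="hf", model_args=MODEL_ARGS, tasks=["arc_easy"], limit=10
)

# same evaluation, assembled by hand
lm = api.registry.get_model("hf").create_from_arg_string(
    MODEL_ARGS, {"batch_size": None, "max_batch_size": None, "device": None}
)
task_dict = tasks.get_task_dict(["arc_easy"], num_fewshot=0)
e2 = evaluator.evaluate(lm=lm, task_dict=task_dict, limit=10)

assert e1["results"]["arc_easy"] == e2["results"]["arc_easy"]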
import lm_eval.tasks as tasks
import pytest
from itertools import islice
import pytest
from typing import List
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
# Using fixtures to get the task class and limit
@pytest.fixture()
def task_class() -> ConfigurableTask:
task_name = ["arc_easy"]
x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
return x[0]
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, task_class):
print("Evaluating task", taskname)
task = task_class()
assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False]
assert task.has_test_docs() in [True, False]
assert isinstance(task.aggregation(), dict)
assert isinstance(task.higher_is_better(), dict)
assert task.aggregation().keys() == task.higher_is_better().keys()
for v in task.higher_is_better().values():
assert v in [True, False]
@pytest.fixture()
def limit() -> int:
return 10
assert isinstance(task.VERSION, int)
# test deterministic docs
# (don't test train because it's slow)
# Tests
task2 = task_class()
limit = None
def test_download(task_class: ConfigurableTask):
task_class().download()
assert task_class().dataset is not None
if taskname in ["triviaqa"] or taskname.startswith("pile_"):
limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit))
assert arr == arr2
def test_has_training_docs(task_class: ConfigurableTask):
assert task_class().has_training_docs() in [True, False]
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
def test_check_training_docs(task_class: ConfigurableTask):
task = task_class()
if task.has_training_docs():
assert task._config["training_split"] is not None
if task.has_test_docs():
arr = list(islice(task.test_docs(), limit))
arr2 = list(islice(task2.test_docs(), limit))
assert arr == arr2
def test_has_validation_docs(task_class):
assert task_class().has_validation_docs() in [True, False]
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
def test_check_validation_docs(task_class):
task = task_class()
if task.has_validation_docs():
assert task._config["validation_split"] is not None
if task.has_training_docs():
arr = list(islice(task.training_docs(), limit))
arr2 = list(islice(task2.training_docs(), limit))
assert arr == arr2
def test_has_test_docs(task_class):
assert task_class().has_test_docs() in [True, False]
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
def test_check_test_docs(task_class):
task = task_class()
if task.has_test_docs():
assert task._config["test_split"] is not None
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, task_class):
print("Evaluating task", taskname)
def test_should_decontaminate(task_class):
task = task_class()
fns = []
if task.has_training_docs():
fns.append(task.training_docs)
if task.has_validation_docs():
fns.append(task.validation_docs)
# test doc might not have labels
# if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns:
# print(list(islice(fn(), 10)))
for doc in islice(fn(), 10):
assert task.should_decontaminate() in [True, False]
if task.should_decontaminate():
assert task._config["doc_to_decontamination_query"] is not None
txt = task.doc_to_text(doc)
tgt = task.doc_to_target(doc)
assert isinstance(txt, str)
assert isinstance(tgt, str)
def test_doc_to_text(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array = [task.doc_to_text(doc) for doc in arr]
# space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
assert all(
isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
)
def test_create_choices(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
if "multiple_choice" in task._config.group:
_array = [task.doc_to_choice(doc) for doc in arr]
# assert all(len(x) == 4 for x in _array)
assert all(isinstance(x, list) for x in _array)
assert all(isinstance(x[0], str) for x in _array)
def test_doc_to_target(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array_target = [task.doc_to_target(doc) for doc in arr]
assert all(isinstance(label, int) for label in _array_target)
assert len(_array_target) == limit if limit else True
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
# space convention
# allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
if len(txt) != 0:
assert txt[-1] != " "
assert tgt[0] == " " or txt[-1] == "\n"
reqs = task.construct_requests(doc, txt)
def test_build_all_requests(task_class, limit):
task_class().build_all_requests(rank=1, limit=limit, world_size=1)
assert task_class.instances is not None
# construct_requests can return just one request
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess
# for req in reqs:
# assert isinstance(req, base.Request)
def test_construct_requests(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
assert all(isinstance(doc, list) for doc in requests)
assert len(requests) == limit if limit else True
# def test_create_choices(task_class):
# arr = list(islice(task_class().test_docs(), 1))
# choices = task_class().create_choices(arr[0])
# assert choices is not None
# checking if number of choices is correct
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_basic_interface(taskname, task_class):
# print("Evaluating task", taskname)
# task = task_class()
#
# assert task.has_training_docs() in [True, False]
# assert task.has_validation_docs() in [True, False]
# assert task.has_test_docs() in [True, False]
#
# assert isinstance(task.aggregation(), dict)
# assert isinstance(task.higher_is_better(), dict)
# assert task.aggregation().keys() == task.higher_is_better().keys()
#
# for v in task.higher_is_better().values():
# assert v in [True, False]
#
# assert isinstance(task.VERSION, int)
#
# # test deterministic docs
# # (don't test train because it's slow)
#
# task2 = task_class()
#
# limit = None
#
# if taskname in ["triviaqa"] or taskname.startswith("pile_"):
# limit = 10000
# if task.has_validation_docs():
# arr = list(islice(task.validation_docs(), limit))
# arr2 = list(islice(task2.validation_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
# if task.has_test_docs():
# arr = list(islice(task.test_docs(), limit))
# arr2 = list(islice(task2.test_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
# if task.has_training_docs():
# arr = list(islice(task.training_docs(), limit))
# arr2 = list(islice(task2.training_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
#
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_documents_and_requests(taskname, task_class):
# print("Evaluating task", taskname)
# task = task_class()
# fns = []
# if task.has_training_docs():
# fns.append(task.training_docs)
# if task.has_validation_docs():
# fns.append(task.validation_docs)
# # test doc might not have labels
# # if task.has_test_docs(): fns.append(task.test_docs)
#
# for fn in fns:
# # print(list(islice(fn(), 10)))
# for doc in islice(fn(), 10):
#
# txt = task.doc_to_text(doc)
# tgt = task.doc_to_target(doc)
#
# assert isinstance(txt, str)
# assert isinstance(tgt, str)
#
# # space convention
# # allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
# if len(txt) != 0:
# assert txt[-1] != " "
# assert tgt[0] == " " or txt[-1] == "\n"
#
# reqs = task.construct_requests(doc, txt)
#
# # construct_requests can return just one request
# if not isinstance(reqs, (list, tuple)):
# reqs = [reqs]
#
# # todo: mock lm after refactoring evaluator.py to not be a mess
# # for req in reqs:
# # assert isinstance(req, base.Request)
......@@ -6,7 +6,7 @@ import lm_eval.models
def test_description():
seed = 42
num_examples = 1
task_names = ["arc_challenge", "lambada"]
task_names = ["arc_challenge", "arc_easy"]
description_dict = {
"arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
......@@ -40,6 +40,5 @@ def test_description():
ctx = task.fewshot_context(
doc=doc,
num_fewshot=1,
- rnd=rnd,
)
assert description in ctx
......@@ -44,9 +44,9 @@ def test_generate_13_grams_1(caplog):
pass
os.makedirs(test_working_directory)
assert not os.path.exists("pile")
os.makedirs("pile")
archive = Archive(os.path.join("pile", "test.jsonl.zst"))
assert not os.path.exists("../pile")
os.makedirs("../pile")
archive = Archive(os.path.join("../pile", "test.jsonl.zst"))
archive.add_data(data)
archive.commit()
......