merge conflicts

16bc6bc0 · haileyschoelkopf · 3d7f777d · 465c695b · 16bc6bc0 · 16bc6bc0
Commit 16bc6bc0 authored Aug 01, 2023 by haileyschoelkopf
6 changed files
--- a/main.py
+++ b/main.py
@@ -28,7 +28,7 @@ def parse_args():
    parser.add_argument(
        "--num_fewshot",
        type=int,
-        default=0,
+        default=None,
        help="Number of examples in few-shot context",
    )
    parser.add_argument("--batch_size", type=int, default=1)  # TODO: only integers

--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,9 @@ setuptools.setup(
        "lm_eval": ["**/*.yaml"],
        "examples": ["**/*.yaml"],
    },
+    entry_points={
+        "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
+    },
    include_package_data=True,
    classifiers=[
        "Development Status :: 3 - Alpha",

--- a/tests/conftest.py
+++ b/tests/conftest.py
-def pytest_addoption(parser):
-    parser.addoption(
-        "--new_task",
-        action="store_true",
-        help="new_tasks_found",
-    )
--- a/tests/extra/test_new_tasks.py
+++ b/tests/extra/test_new_tasks.py
+import pytest
+from itertools import islice
+import lm_eval.tasks as tasks
+from .utilities_testing import load_changed_files, parser
+from typing import List
+from lm_eval.api.task import ConfigurableTask
+import os
+
+
+# GitHub CI
+def new_tasks() -> List[str]:
+    """Choose the task names this test module should exercise.
+
+    Resolution order:
+      1. If the changed-files report written by CI exists on disk, return the
+         task names obtained by ``parser(load_changed_files(report))``.
+      2. Otherwise, if the ``API`` environment variable is set (to any value),
+         return a fixed list of four tasks.
+      3. Otherwise return only ``arc_easy``.
+    """
+    changed_files_report = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+
+    # Task files changed: derive the task names from the report.
+    if os.path.exists(changed_files_report):
+        return parser(load_changed_files(changed_files_report))
+
+    # Neither the report nor the API flag is present: minimal default.
+    if os.getenv("API") is None:
+        return ["arc_easy"]
+
+    # API flag present: run the fixed set of tasks.
+    return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+
+
+def get_task_class() -> List[ConfigurableTask]:
+    """Return the ``tasks.TASK_REGISTRY`` values whose key is in ``new_tasks()``.
+
+    The result is used to parametrize the test class below, where each entry
+    is called with no arguments to build a task.
+    """
+    selected_names = new_tasks()
+    return [
+        registered_cls
+        for registered_name, registered_cls in tasks.TASK_REGISTRY.items()
+        if registered_name in selected_names
+    ]
+
+
+@pytest.fixture()
+def limit() -> int:
+    """Fixture returning 10; tests pass it to ``islice`` and ``build_all_requests`` as the document cap."""
+    return 10
+
+
+# Tests
+@pytest.mark.parametrize("task_class", get_task_class())
+class TestNewTasks:
+    """Smoke tests run once for every entry returned by ``get_task_class()``.
+
+    ``task_class`` is called with no arguments to build the task under test;
+    ``limit`` (fixture) bounds how many documents are sampled.
+    """
+
+    @staticmethod
+    def _sample_docs(task, limit):
+        """Return up to ``limit`` docs from the test split, else the validation split."""
+        docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
+        return list(islice(docs, limit))
+
+    def test_download(self, task_class: ConfigurableTask):
+        # Assert on the instance that ran download(); a separate fresh
+        # instance would say nothing about this particular call.
+        task = task_class()
+        task.download()
+        assert task.dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class().has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        task = task_class()
+        if task.has_training_docs():
+            assert task._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class().has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        task = task_class()
+        if task.has_validation_docs():
+            assert task._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class().has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class()
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class()
+        decontaminate = task.should_decontaminate()
+        assert decontaminate in [True, False]
+        if decontaminate:
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class()
+        texts = [task.doc_to_text(doc) for doc in self._sample_docs(task, limit)]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(isinstance(txt, str) and not txt.endswith(" ") for txt in texts)
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class()
+        docs = self._sample_docs(task, limit)
+        # Assumption (TODO confirm): ``group`` may be unset for some configs;
+        # treat that as "not a multiple-choice task" instead of raising TypeError.
+        group = task._config.group or ()
+        if "multiple_choice" in group:
+            choices = [task.doc_to_choice(doc) for doc in docs]
+            # The number of choices is intentionally not checked (a former
+            # ``len(x) == 4`` assertion was disabled).
+            assert all(isinstance(choice, list) for choice in choices)
+            assert all(isinstance(choice[0], str) for choice in choices)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class()
+        targets = [task.doc_to_target(doc) for doc in self._sample_docs(task, limit)]
+        assert all(isinstance(label, int) for label in targets)
+        if limit:
+            assert len(targets) == limit
+        # TODO: also verify the whitespace convention between text and target
+        # (target starts with " " or text ends with "\n"); an earlier attempt
+        # at this assertion was marked as not working.
+
+    def test_build_all_requests(self, task_class, limit):
+        # Read ``instances`` from the instance that built the requests; looking
+        # it up on the class would not reflect this call.
+        # NOTE(review): rank=1 with world_size=1 looks out of range if ranks
+        # are 0-based -- confirm against build_all_requests before changing.
+        task = task_class()
+        task.build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task.instances is not None
+
+    def test_construct_requests(self, task_class, limit):
+        task = task_class()
+        docs = self._sample_docs(task, limit)
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in docs]
+        assert all(isinstance(req, list) for req in requests)
+        if limit:
+            assert len(requests) == limit
--- a/tests/extra/utilities_testing.py
+++ b/tests/extra/utilities_testing.py
+import json
+from typing import List
+from lm_eval.utils import load_yaml_config
+from pathlib import Path
+import sys
+
+# This is the path where the output for the changed files for the tasks folder is stored
+# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+
+
+# reads a text file and returns a list of words
+# used to read the output of the changed txt from tj-actions/changed-files
+def load_changed_files(file_path: str) -> List[str]:
+    """Read ``file_path`` and return its whitespace-separated tokens.
+
+    Args:
+        file_path: Path of a text file listing changed files, separated by
+            any whitespace (spaces or newlines).
+
+    Returns:
+        The tokens in file order; an empty list for an empty file.
+
+    Raises:
+        OSError: If the file cannot be opened.
+
+    Side effect: writes the parsed list to stdout for the CI log.
+    """
+    # Explicit encoding so the result does not depend on the platform locale.
+    with open(file_path, "r", encoding="utf-8") as f:
+        words_list = f.read().split()
+    # Terminate the log line so it does not run into whatever is printed next.
+    sys.stdout.write(f"list of files: {words_list}\n")
+    return words_list
+
+
+# checks the txt file for list of changed files.
+# if file ends with .yaml then check yaml for task name
+# if file ends with .py then parse the folder for all yaml files
+def parser(full_path: List[str]) -> List[str]:
+    """Map changed file paths to the ``task`` names of the affected YAML configs.
+
+    For a path ending in ``.yaml`` the config itself is loaded; for a path
+    ending in ``.py`` every ``*.yaml`` in the same directory is loaded. Other
+    paths are ignored. Each loaded config contributes its ``"task"`` entry.
+
+    Returns:
+        The distinct task names, in no particular order.
+    """
+    task_names = set()
+    for changed_file in full_path:
+        if changed_file.endswith(".yaml"):
+            yaml_files = [changed_file]
+        elif changed_file.endswith(".py"):
+            sibling_dir = Path(changed_file).parent
+            yaml_files = [str(cfg) for cfg in sibling_dir.glob("*.yaml")]
+        else:
+            continue
+        task_names.update(load_yaml_config(cfg)["task"] for cfg in yaml_files)
+    return list(task_names)
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
-import pytest
 from itertools import islice
+import pytest
+from typing import List
 import lm_eval.tasks as tasks
-from tests.extra.test_utils import load_changed_files, parser
-from typing import List, ClassVar
-import os
-
+from lm_eval.api.task import ConfigurableTask

+# Using fixtures to get the task class and limit
 @pytest.fixture()
-def any_new_tasks(request) -> bool:
-    return request.config.getoption("--new_task")
-
-
-# ["arc_easy] else get list of new tasks
-def new_tasks(any_new_tasks: bool) -> List[str]:
-    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
-    if any_new_tasks and os.path.exists(FILENAME):
-        return [parser(load_changed_files(FILENAME))]
-    elif os.getenv("API") is not None:
-        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
-    else:
-        return ["arc_easy"]
-
-
-@pytest.fixture(params=new_tasks(any_new_tasks))
-def task_class(request):
-    task_name = request.param
-    return [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name][0]
+def task_class() -> ConfigurableTask:
+    task_name = ["arc_easy"]
+    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
+    return x[0]


 @pytest.fixture()
-def limit(any_new_tasks: bool) -> int:
-    return 100 if any_new_tasks else 10
+def limit() -> int:
+    return 10


 # Tests


-def test_download(task_class):
+def test_download(task_class: ConfigurableTask):
    task_class().download()
    assert task_class().dataset is not None


-def test_has_training_docs(task_class):
+def test_has_training_docs(task_class: ConfigurableTask):
    assert task_class().has_training_docs() in [True, False]


-def test_check_training_docs(task_class):
+def test_check_training_docs(task_class: ConfigurableTask):
    task = task_class()
-    assert task.has_training_docs() if task._config["training_split"] else True
+    if task.has_training_docs():
+        assert task._config["training_split"] is not None


 def test_has_validation_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_validation_docs() in [True, False]


 def test_check_validation_docs(task_class):
    task = task_class()
-    assert (
-        task_class().has_training_docs() if task._config["validation_split"] else True
-    )
+    if task.has_validation_docs():
+        assert task._config["validation_split"] is not None


 def test_has_test_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_test_docs() in [True, False]


 def test_check_test_docs(task_class):
    task = task_class()
-    assert task_class().has_training_docs() if task._config["test_split"] else True
+    if task.has_test_docs():
+        assert task._config["test_split"] is not None


 def test_should_decontaminate(task_class):
-    task_class = task_class()
-    assert task_class.should_decontaminate() in [True, False]
-    if task_class.should_decontaminate():
-        assert task_class._config["doc_to_decontamination_query"] is not None
+    task = task_class()
+    assert task.should_decontaminate() in [True, False]
+    if task.should_decontaminate():
+        assert task._config["doc_to_decontamination_query"] is not None


 def test_doc_to_text(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array = [task_class().doc_to_text(doc) for doc in arr]
+    _array = [task.doc_to_text(doc) for doc in arr]
    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
@@ -91,24 +77,27 @@ def test_doc_to_text(task_class, limit):


 def test_create_choices(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array = [task_class().doc_to_choice(doc) for doc in arr]
-    # assert all(len(x) == 4 for x in _array)
-    assert all(isinstance(x, list) for x in _array)
-    assert all(isinstance(x[0], str) for x in _array)
+    if "multiple_choice" in task._config.group:
+        _array = [task.doc_to_choice(doc) for doc in arr]
+        # assert all(len(x) == 4 for x in _array)
+        assert all(isinstance(x, list) for x in _array)
+        assert all(isinstance(x[0], str) for x in _array)


 def test_doc_to_target(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_target())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array_target = [task_class().doc_to_target(doc) for doc in arr]
+    _array_target = [task.doc_to_target(doc) for doc in arr]
    assert all(isinstance(label, int) for label in _array_target)
    assert len(_array_target) == limit if limit else True
    # _array_text = [task.doc_to_text(doc) for doc in arr]
@@ -122,15 +111,13 @@ def test_build_all_requests(task_class, limit):


 def test_construct_requests(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    requests = [
-        task_class().construct_requests(doc, task_class().doc_to_text(doc))
-        for doc in arr
-    ]
+    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
    assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True