GitLab · gaoqiong / lm-evaluation-harness · Commits

Commit 6cc1ae7c
Authored Sep 05, 2023 by baberabb
Parent: 2e747c5b

    consolidated tasks testing

Showing 6 changed files, with 129 additions and 271 deletions:

    .github/workflows/new_tasks.yml    +5    -5
    tests/extra/__init__.py            +0    -0    (deleted)
    tests/extra/test_new_tasks.py      +0    -129  (deleted)
    tests/extra/test_utils.py          +0    -23   (deleted)
    tests/test_tasks.py                +105  -110
    tests/utils.py                     +19   -4    (renamed from tests/extra/utilities_testing.py)
.github/workflows/new_tasks.yml

@@ -3,10 +3,10 @@ name: Tasks Modified
 on:
   push:
     branches:
-      - big-refactor
+      - 'big-refactor*'
   pull_request:
     branches:
-      - big-refactor
+      - 'big-refactor*'
   workflow_dispatch:
 # comment/edit out the above to stop/change the triggers
 jobs:
@@ -18,7 +18,7 @@ jobs:
       - name: checkout
         uses: actions/checkout@v3
         with:
-          fetch-depth: 0  # OR "2" -> To retrieve the preceding commit.
+          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
       # Uses the tj-actions/changed-files@v37 action to check for changes.
       # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
@@ -63,10 +63,10 @@ jobs:
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
       # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
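Both pytest steps now target the consolidated tests/test_tasks.py. The selection behind steps.changed-tasks is consumed by the helpers in tests/utils.py (the renamed file at the bottom of this diff); a minimal sketch of replaying that selection locally, assuming the changed-files output has already been written to the path below and that tests/ is importable as a package:

# Hypothetical local replay of the CI task selection; in CI the output file
# is produced by tj-actions/changed-files, here it must already exist on disk.
from tests.utils import load_changed_files, parser

OUTPUT_FILE = ".github/outputs/tasks_all_changed_and_modified_files.txt"

changed = load_changed_files(OUTPUT_FILE)  # whitespace-separated file paths
print(parser(changed))                     # task names parsed from the YAMLs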
tests/extra/__init__.py (deleted, 100644 → 0; +0 -0, the file was empty)
tests/extra/test_new_tasks.py (deleted, 100644 → 0)

import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If tasks folder has changed then we get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or if API has changed then we set the ENV variable API to True
        # and run given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if both not true just do arc_easy
    else:
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
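These tests reappear almost verbatim as the TestNewTasks class in tests/test_tasks.py below; the substantive change is that the parametrize list now carries task instances rather than task classes. A standalone sketch of the two styles (Dummy is a stand-in, not repo code):

import pytest

class Dummy:
    dataset = object()

    def download(self):
        pass

# Old style (this deleted file): parametrize with classes, instantiate per use.
@pytest.mark.parametrize("task_class", [Dummy])
def test_class_style(task_class):
    task_class().download()
    assert task_class().dataset is not None

# New style (tests/test_tasks.py below): parametrize with ready-made instances.
@pytest.mark.parametrize("task_class", [Dummy()])
def test_instance_style(task_class):
    task_class.download()
    assert task_class.dataset is not None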
tests/extra/test_utils.py (deleted, 100644 → 0)

import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [l for line in f.readlines() for l in line.strip().split(" ")]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
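The parser/load_changed_files pair moves to tests/utils.py (see below). For illustration, a standalone re-implementation of the same selection logic, assuming a plain-YAML task config and substituting PyYAML for lm_eval.utils.load_yaml_config:

from pathlib import Path
from typing import List
import yaml  # stand-in for lm_eval.utils.load_yaml_config

def parse_changed(paths: List[str]) -> List[str]:
    tasks = set()
    for p in paths:
        if p.endswith(".yaml"):
            # a changed task config contributes its own "task" name
            tasks.add(yaml.safe_load(Path(p).read_text())["task"])
        elif p.endswith(".py"):
            # a changed .py pulls in every YAML config sitting beside it
            for sibling in Path(p).parent.glob("*.yaml"):
                tasks.add(yaml.safe_load(sibling.read_text())["task"])
    return list(tasks)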
tests/test_tasks.py

 from itertools import islice
 import pytest
 from typing import List
+from .utils import new_tasks
 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask

-# Using fixtures to get the task class and limit
-@pytest.fixture()
-def task_class() -> ConfigurableTask:
-    task_name = ["arc_easy"]
-    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
-    return x[0]
+# Default Task
+TASKS = ["arc_easy"]
+
+
+def task_class():
+    global TASKS
+    # CI: new_tasks checks if any modifications have been made
+    task_classes = new_tasks()
+    # Check if task_classes is empty
+    if task_classes:
+        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
+    else:
+        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]

 @pytest.fixture()
@@ -18,109 +26,96 @@ def limit() -> int:
 # Tests
-def test_download(task_class: ConfigurableTask):
-    task_class().download()
-    assert task_class().dataset is not None
-
-def test_has_training_docs(task_class: ConfigurableTask):
-    assert task_class().has_training_docs() in [True, False]
-
-def test_check_training_docs(task_class: ConfigurableTask):
-    task = task_class()
-    if task.has_training_docs():
-        assert task._config["training_split"] is not None
-
-def test_has_validation_docs(task_class):
-    assert task_class().has_validation_docs() in [True, False]
-
-def test_check_validation_docs(task_class):
-    task = task_class()
-    if task.has_validation_docs():
-        assert task._config["validation_split"] is not None
-
-def test_has_test_docs(task_class):
-    assert task_class().has_test_docs() in [True, False]
-
-def test_check_test_docs(task_class):
-    task = task_class()
-    if task.has_test_docs():
-        assert task._config["test_split"] is not None
-
-def test_should_decontaminate(task_class):
-    task = task_class()
-    assert task.should_decontaminate() in [True, False]
-    if task.should_decontaminate():
-        assert task._config["doc_to_decontamination_query"] is not None
-
-def test_doc_to_text(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array = [task.doc_to_text(doc) for doc in arr]
-    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-    assert all(
-        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
-        for x in _array
-    )
-
-def test_create_choices(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    if "multiple_choice" in task._config.output_type:
-        _array = [task.doc_to_choice(doc) for doc in arr]
-        # assert all(len(x) == 4 for x in _array)
-        assert all(isinstance(x, list) for x in _array)
-        assert all(isinstance(x[0], str) for x in _array)
-
-def test_doc_to_target(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array_target = [task.doc_to_target(doc) for doc in arr]
-    if task._config.output_type == "multiple_choice":
-        assert all(isinstance(label, int) for label in _array_target)
-    # _array_text = [task.doc_to_text(doc) for doc in arr]
-    # Not working
-    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
-
-def test_build_all_requests(task_class, limit):
-    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
-    assert task_class.instances is not None
-
-# ToDO: Add proper testing
-def test_construct_requests(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    # assert all(isinstance(doc, list) for doc in requests)
-    assert len(requests) == limit if limit else True
+@pytest.mark.parametrize("task_class", task_class())
+class TestNewTasks:
+    def test_download(self, task_class: ConfigurableTask):
+        task_class.download()
+        assert task_class.dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class.has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        if task_class.has_training_docs():
+            assert task_class._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class.has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        if task_class.has_validation_docs():
+            assert task_class._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class.has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class
+        assert task.should_decontaminate() in [True, False]
+        if task.should_decontaminate():
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(
+            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+            for x in _array
+        )
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        if "multiple_choice" in task._config.output_type:
+            _array = [task.doc_to_choice(doc) for doc in arr]
+            # assert all(len(x) == 4 for x in _array)
+            assert all(isinstance(x, list) for x in _array)
+            assert all(isinstance(x[0], str) for x in _array)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array_target = [task.doc_to_target(doc) for doc in arr]
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
+        # _array_text = [task.doc_to_text(doc) for doc in arr]
+        # Not working
+        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
+
+    def test_build_all_requests(self, task_class, limit):
+        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task_class.instances is not None

+    # ToDO: Add proper testing
+    def test_construct_requests(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # assert all(isinstance(doc, list) for doc in requests)
+        assert len(requests) == limit if limit else True
 # def test_create_choices(task_class):
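Note why task_class changed from a fixture to a plain function: arguments to @pytest.mark.parametrize are evaluated at collection time, and fixtures cannot be called there. A minimal sketch of the pattern (stand-in values, not repo code):

import pytest

def make_params():
    # runs once, when pytest imports and collects this module
    return ["arc_easy"]

# parametrizing the class applies the parameter to every test method
@pytest.mark.parametrize("name", make_params())
class TestSketch:
    def test_name_is_str(self, name):
        assert isinstance(name, str)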
tests/extra/utilities_testing.py → tests/utils.py (renamed)

 import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
+import sys
+from typing import Union
+import os

 # {{{CI}}}
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
     with open(file_path, "r") as f:
         content = f.read()
         words_list = [x for x in content.split()]
         sys.stdout.write(f"list of files: {words_list}")
         return words_list

@@ -30,3 +30,18 @@ def parser(full_path: List[str]) -> List[str]:
             path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
             _output |= {load_yaml_config(x)["task"] for x in path}
     return list(_output)
+
+
+def new_tasks() -> Union[list[str], None]:
+    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
+        return parser(load_changed_files(FILENAME))
+    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
+        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+    # if both not true just do arc_easy
+    else:
+        return
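The bare return in the final branch yields None (matching the Union[list[str], None] annotation), which the if task_classes: guard in tests/test_tasks.py treats the same as an empty list, falling back to the default TASKS. In miniature:

DEFAULT_TASKS = ["arc_easy"]

task_classes = None  # what new_tasks() returns when nothing matched
selected = task_classes if task_classes else DEFAULT_TASKS
assert selected == ["arc_easy"]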