new_tasks have own tests now

3d0282cf · baberabb · a199314a · 3d0282cf · 3d0282cf · 3d0282cf
Commit 3d0282cf authored Jul 21, 2023 by baberabb
4 changed files
--- a/.github/workflows/new_tasks.yml
+++ b/.github/workflows/new_tasks.yml
@@ -4,10 +4,11 @@ on:
  push:
  pull_request:
  workflow_dispatch:
-
+# Comment out or edit the triggers above to disable or change when this workflow runs.
 jobs:
  changed_files:
    runs-on: ubuntu-latest  # windows-latest || macos-latest
+    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
@@ -17,8 +18,8 @@ jobs:

      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
-      # The `files_yaml` input optionally takes a yaml string to specify which files to check,
-      # and prepends the name to the standard output names.
+      # The `files_yaml` input optionally takes a yaml string to specify filters,
+      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
@@ -31,14 +32,15 @@ jobs:
              - lm_eval/api/**
          write_output_files: true

-        # This is a workaround to get the list of all modified files and save it to an env variable.
-        # The next two echo statements are just for logging.
+    # The next step is optional: the changed-file lists are already written to the
+    # workspace by the step above (write_output_files: true), so it is only for debugging.
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
+
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
@@ -53,7 +55,7 @@ jobs:
    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
-        # if new tasks are added, run tests on them; pytest file=test_new_tasks
+        # if new tasks are added, run tests on them
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
        # if api is modified, run tests on it

--- a/tests/extra/test_new_tasks.py
+++ b/tests/extra/test_new_tasks.py
@@ -2,137 +2,126 @@ import pytest
 from itertools import islice
 import lm_eval.tasks as tasks
 from .utilities_testing import load_changed_files, parser
-from typing import List, ClassVar
+from typing import List
 from lm_eval.api.task import ConfigurableTask
 import os


 # GitHub CI
-# If tasks folder has changed then we get the list of files from FILENAME
-# and parse the yaml files to get the task names.
-# Or if API has changed then we set the ENV variable API to True
-# and run some given extended tasks
 def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
+        # Or, if the API has changed, the ENV variable API is set
+        # and we run the given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if neither is true, just run arc_easy
    else:
        return ["arc_easy"]


-@pytest.fixture(params=new_tasks())
-def task_class(request) -> ConfigurableTask:
-    task_name = request.param
-    if task_name is None:
-        task_name = "arc_easy"
-    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name == task_name]
-    return x[0]
+def get_task_class() -> List[ConfigurableTask]:
+    task_name = new_tasks()
+    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
+    return x


-@pytest.fixture(params=new_tasks())
-def limit(request) -> int:
-    # not used; just for consistency
-    return 100
+@pytest.fixture()
+def limit() -> int:
+    return 10


 # Tests
-def test_download(task_class: ConfigurableTask):
-    task_class().download()
-    assert task_class().dataset is not None
-
-
-def test_has_training_docs(task_class: ConfigurableTask):
-    assert task_class().has_training_docs() in [True, False]
-
-
-def test_check_training_docs(task_class: ConfigurableTask):
-    task = task_class()
-    assert task.has_training_docs() if task._config["training_split"] else True
-
-
-def test_has_validation_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
-
-
-def test_check_validation_docs(task_class):
-    task = task_class()
-    assert (
-        task_class().has_training_docs() if task._config["validation_split"] else True
-    )
-
-
-def test_has_test_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
-
-
-def test_check_test_docs(task_class):
-    task = task_class()
-    assert task_class().has_training_docs() if task._config["test_split"] else True
-
-
-def test_should_decontaminate(task_class):
-    task_class = task_class()
-    assert task_class.should_decontaminate() in [True, False]
-    if task_class.should_decontaminate():
-        assert task_class._config["doc_to_decontamination_query"] is not None
-
-
-def test_doc_to_text(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
-    )
-    _array = [task_class().doc_to_text(doc) for doc in arr]
-    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-    assert all(
-        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
-    )
-
-
-def test_create_choices(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
-    )
-    _array = [task_class().doc_to_choice(doc) for doc in arr]
-    # assert all(len(x) == 4 for x in _array)
-    assert all(isinstance(x, list) for x in _array)
-    assert all(isinstance(x[0], str) for x in _array)
-
-
-def test_doc_to_target(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_target())
-    )
-    _array_target = [task_class().doc_to_target(doc) for doc in arr]
-    assert all(isinstance(label, int) for label in _array_target)
-    assert len(_array_target) == limit if limit else True
-    # _array_text = [task.doc_to_text(doc) for doc in arr]
-    # Not working
-    # assert all(tgt[0] == " " or txt[-1] == "\n" if  len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
-
-
-def test_build_all_requests(task_class, limit):
-    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
-    assert task_class.instances is not None
-
-
-def test_construct_requests(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
-    )
-    requests = [
-        task_class().construct_requests(doc, task_class().doc_to_text(doc))
-        for doc in arr
-    ]
-    assert all(isinstance(doc, list) for doc in requests)
-    assert len(requests) == limit if limit else True
+@pytest.mark.parametrize("task_class", get_task_class())
+class TestNewTasks:
+    def test_download(self, task_class: ConfigurableTask):
+        task_class().download()
+        assert task_class().dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class().has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        task = task_class()
+        if task.has_training_docs():
+            assert task._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class().has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        task = task_class()
+        if task.has_validation_docs():
+            assert task._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class().has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class()
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class()
+        assert task.should_decontaminate() in [True, False]
+        if task.should_decontaminate():
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(
+            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+            for x in _array
+        )
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_choice(doc) for doc in arr]
+        # assert all(len(x) == 4 for x in _array)
+        assert all(isinstance(x, list) for x in _array)
+        assert all(isinstance(x[0], str) for x in _array)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array_target = [task.doc_to_target(doc) for doc in arr]
+        assert all(isinstance(label, int) for label in _array_target)
+        assert len(_array_target) == limit if limit else True
+        # _array_text = [task.doc_to_text(doc) for doc in arr]
+        # Not working
+        # assert all(tgt[0] == " " or txt[-1] == "\n" if  len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
+
+    def test_build_all_requests(self, task_class, limit):
+        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task_class.instances is not None
+
+    def test_construct_requests(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        assert all(isinstance(doc, list) for doc in requests)
+        assert len(requests) == limit if limit else True
--- a/tests/extra/utilities_testing.py
+++ b/tests/extra/utilities_testing.py
@@ -2,14 +2,20 @@ import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
+import sys

 # This is the path where the output for the changed files for the tasks folder is stored
-FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


-def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
+# Reads a text file and returns its whitespace-separated tokens as a list.
+# Used to read the changed-files .txt output written by tj-actions/changed-files.
+def load_changed_files(file_path: str) -> List[str]:
    with open(file_path, "r") as f:
-        return [line.strip() for line in f.readlines()]
+        content = f.read()
+        words_list = [x for x in content.split()]
+        sys.stdout.write(f"list of files: {words_list}")
+    return words_list


 # checks the txt file for list of changed files.

--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -14,8 +14,8 @@ def task_class(task_name: List[str] = None) -> ConfigurableTask:


 @pytest.fixture()
-def limit(any_new_tasks: bool) -> int:
-    return 100 if any_new_tasks else 10
+def limit() -> int:
+    return 10


 # Tests
@@ -32,43 +32,45 @@ def test_has_training_docs(task_class: ConfigurableTask):

 def test_check_training_docs(task_class: ConfigurableTask):
    task = task_class()
-    assert task.has_training_docs() if task._config["training_split"] else True
+    if task.has_training_docs():
+        assert task._config["training_split"] is not None


 def test_has_validation_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_validation_docs() in [True, False]


 def test_check_validation_docs(task_class):
    task = task_class()
-    assert (
-        task_class().has_training_docs() if task._config["validation_split"] else True
-    )
+    if task.has_validation_docs():
+        assert task._config["validation_split"] is not None


 def test_has_test_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_test_docs() in [True, False]


 def test_check_test_docs(task_class):
    task = task_class()
-    assert task_class().has_training_docs() if task._config["test_split"] else True
+    if task.has_test_docs():
+        assert task._config["test_split"] is not None


 def test_should_decontaminate(task_class):
-    task_class = task_class()
-    assert task_class.should_decontaminate() in [True, False]
-    if task_class.should_decontaminate():
-        assert task_class._config["doc_to_decontamination_query"] is not None
+    task = task_class()
+    assert task.should_decontaminate() in [True, False]
+    if task.should_decontaminate():
+        assert task._config["doc_to_decontamination_query"] is not None


 def test_doc_to_text(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array = [task_class().doc_to_text(doc) for doc in arr]
+    _array = [task.doc_to_text(doc) for doc in arr]
    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
@@ -76,24 +78,26 @@ def test_doc_to_text(task_class, limit):


 def test_create_choices(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array = [task_class().doc_to_choice(doc) for doc in arr]
+    _array = [task.doc_to_choice(doc) for doc in arr]
    # assert all(len(x) == 4 for x in _array)
    assert all(isinstance(x, list) for x in _array)
    assert all(isinstance(x[0], str) for x in _array)


 def test_doc_to_target(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_target())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array_target = [task_class().doc_to_target(doc) for doc in arr]
+    _array_target = [task.doc_to_target(doc) for doc in arr]
    assert all(isinstance(label, int) for label in _array_target)
    assert len(_array_target) == limit if limit else True
    # _array_text = [task.doc_to_text(doc) for doc in arr]
@@ -107,15 +111,13 @@ def test_build_all_requests(task_class, limit):


 def test_construct_requests(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    requests = [
-        task_class().construct_requests(doc, task_class().doc_to_text(doc))
-        for doc in arr
-    ]
+    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
    assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True