Commit 4c250860 authored by baberabb

edited test_tasks.py and workflow files

parent 31995161
name: Tasks Modified
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
      # Example 1
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
          write_output_files: true
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: |
          # $GITHUB_ENV expects NAME=value lines; TASKS_LIST is an illustrative variable name
          echo "TASKS_LIST=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV"
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
          cache: 'pip'
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
#name: Build
#
#on:
#  push:
#  pull_request:
#  workflow_dispatch:
#
#jobs:
#  build:
#    runs-on: ubuntu-latest
#    strategy:
#      matrix:
#        python-version: ["3.8", "3.9", "3.10"]
#    steps:
#      - uses: actions/checkout@v3
#      - name: Set up Python ${{ matrix.python-version }}
#        uses: actions/setup-python@v3
#        with:
#          python-version: ${{ matrix.python-version }}
#      - name: Install dependencies
#        run: |
#          python -m pip install --upgrade pip
#          pip install flake8 pytest pytest-cov
#          pip install -e .[dev,multilingual]
#          # Install optional git dependencies
#          pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
#          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
#      - name: Lint with flake8
#        run: |
#          # stop the build if there are Python syntax errors or undefined names
#          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
#          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
#          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
#      - name: Test with pytest
#        run: |
#          pytest -vv tests/evaluator.py
##      - name: Upload to codecov
##        run: |
##          bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Tests
name: Unit Tests
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
@@ -54,4 +58,4 @@ jobs:
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto tests/test_evaluator.py tests/test_tasks.py
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
def pytest_addoption(parser):
    parser.addoption(
        "--new_task",
        action="store_true",
        help="new_tasks_found",
    )
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

# file written by the "Check task folders" step in the workflow above
FILE_PATH = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [line.strip() for line in f.readlines()]


def parser(full_path: List[str]) -> List[str]:
    # map changed files to task names: a changed YAML contributes its "task" entry,
    # a changed .py contributes the "task" entry of every sibling YAML
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
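A quick usage sketch (not part of the commit) of how these helpers resolve changed files to task names; the `utils.py` path below is hypothetical, while the YAML path matches the task config touched in this commit:

# illustration only
changed = [
    "lm_eval/tasks/hellaswag/hellaswag.yaml",  # a task YAML contributes its own "task" name
    "lm_eval/tasks/hellaswag/utils.py",  # hypothetical .py: every sibling *.yaml's "task" is collected
]
print(parser(changed))  # e.g. ["hellaswag"]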
lm_eval/tasks/hellaswag/hellaswag.yaml
@@ -5,7 +5,7 @@ import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import random
import pytest
@@ -29,43 +29,6 @@ import pytest
def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
    task_name = task_name
    limit = 10
    model, model_args = model, model_args
    # task_dict = tasks.get_task_dict(task)
    # TODO: re-add cachingLM
    # os.system("rm test_cache.db")
    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    # lm = registry.get_model("dummy")()
    # def ll_fn(reqs):
    #     for ctx, cont in [req.args for req in reqs]:
    #         if len(ctx) == 0:
    #             continue
    #         # space convention
    #         assert ctx[-1] != " "
    #         assert cont[0] == " " or ctx[-1] == "\n"
    #
    #     res = []
    #
    #     random.seed(42)
    #     for _ in reqs:
    #         res.append((-random.random(), False))
    #
    #     return res
    #
    # def ll_perp_fn(reqs):
    #     for (string,) in reqs:
    #         assert isinstance(string, str)
    #
    #     res = []
    #     random.seed(42)
    #     for _ in reqs:
    #         res.append(-random.random())
    #
    #     return res
    #
    # lm.loglikelihood = ll_fn
    # lm.loglikelihood_rolling = ll_perp_fn
    e1 = evaluator.simple_evaluate(
        model=model,
@@ -73,12 +36,31 @@ def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str
        limit=limit,
        model_args=model_args,
    )
    e2 = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
    assert e1 is not None
    lm = api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": None,
            "max_batch_size": None,
            "device": None,
        },
    )
    task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
        model_args=model_args,
    )
    assert e2 is not None
    # check that caching is working
    assert e1 == e2

    def r(x):
        return x["results"]["arc_easy"]

    assert all(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )
import lm_eval.tasks as tasks
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from tests.extra.test_utils import load_changed_files, parser
from typing import List, ClassVar
import os
@pytest.fixture()
def task_class(task_name="arc_easy"):
    return next(
        (name, cls) for name, cls in tasks.TASK_REGISTRY.items() if name == task_name
    )[1]


def any_new_tasks(request) -> bool:
    return request.config.getoption("--new_task")


# ["arc_easy"] else get list of new tasks
def new_tasks(any_new_tasks: bool) -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    return [
        parser(load_changed_files(FILENAME))
        if any_new_tasks and os.path.exists(FILENAME)
        else "arc_easy"
    ]


@pytest.fixture(params=new_tasks(any_new_tasks))
def task_class(request):
    task_name = request.param
    return [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name][0]


@pytest.fixture()
def limit(limit=10):
    return limit
def limit(any_new_tasks: bool) -> int:
    return 100 if any_new_tasks else 10
# Tests
def test_download(task_class):
@@ -25,7 +45,8 @@ def test_has_training_docs(task_class):
def test_check_training_docs(task_class):
    assert task_class().has_training_docs()
    task = task_class()
    assert task.has_training_docs() if task._config["training_split"] else True
def test_has_validation_docs(task_class):
@@ -33,7 +54,10 @@ def test_has_validation_docs(task_class):
def test_check_validation_docs(task_class):
    assert task_class().has_training_docs()
    task = task_class()
    assert (
        task.has_validation_docs() if task._config["validation_split"] else True
    )
def test_has_test_docs(task_class):
@@ -41,28 +65,51 @@ def test_has_test_docs(task_class):
def test_check_test_docs(task_class):
    assert task_class().has_training_docs()
    task = task_class()
    assert task.has_test_docs() if task._config["test_split"] else True
def test_should_decontaminate(task_class):
    assert task_class().should_decontaminate() in [True, False]
    task_class = task_class()
    assert task_class.should_decontaminate() in [True, False]
    if task_class.should_decontaminate():
        assert task_class._config["doc_to_decontamination_query"] is not None
def test_doc_to_text(task_class, limit):
    task = task_class()
    arr = list(islice(task.test_docs(), limit)) if limit else list(task.test_docs())
    _array = [task.doc_to_text(doc) for doc in arr]
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    _array = [task_class().doc_to_text(doc) for doc in arr]
    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
    )
def test_create_choices(task_class, limit):
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    _array = [task_class().doc_to_choice(doc) for doc in arr]
    # assert all(len(x) == 4 for x in _array)
    assert all(isinstance(x, list) for x in _array)
    assert all(isinstance(x[0], str) for x in _array)
def test_doc_to_target(task_class, limit):
    task = task_class()
    arr = list(islice(task.test_docs(), limit)) if limit else list(task.test_target())
    _array_target = [task.doc_to_target(doc) for doc in arr]
    assert all(isinstance(doc, str) for doc in _array_target)
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_target())
    )
    _array_target = [task_class().doc_to_target(doc) for doc in arr]
    assert all(isinstance(label, int) for label in _array_target)
    assert len(_array_target) == limit if limit else True
    # _array_text = [task.doc_to_text(doc) for doc in arr]
    # Not working
    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -74,18 +121,24 @@ def test_build_all_requests(task_class, limit):
def test_construct_requests(task_class, limit):
    task = task_class()
    arr = list(islice(task.test_docs(), limit)) if limit else list(task.test_docs())
    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    requests = [
        task_class().construct_requests(doc, task_class().doc_to_text(doc))
        for doc in arr
    ]
    assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True
def test_create_choices(task_class):
    arr = list(islice(task_class().test_docs(), 1))
    choices = task_class().create_choices(arr[0])
    assert choices is not None
    # checking if number of choices is correct
# def test_create_choices(task_class):
#     arr = list(islice(task_class().test_docs(), 1))
#     choices = task_class().create_choices(arr[0])
#     assert choices is not None
#     # checking if number of choices is correct
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
......
@@ -6,7 +6,7 @@ import lm_eval.models
def test_description():
    seed = 42
    num_examples = 1
    task_names = ["arc_challenge", "lambada"]
    task_names = ["arc_challenge", "arc_easy"]
    description_dict = {
        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
        "lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
@@ -40,6 +40,5 @@ def test_description():
        ctx = task.fewshot_context(
            doc=doc,
            num_fewshot=1,
            rnd=rnd,
        )
        assert description in ctx
@@ -44,9 +44,9 @@ def test_generate_13_grams_1(caplog):
        pass
    os.makedirs(test_working_directory)
    assert not os.path.exists("pile")
    os.makedirs("pile")
    archive = Archive(os.path.join("pile", "test.jsonl.zst"))
    assert not os.path.exists("../pile")
    os.makedirs("../pile")
    archive = Archive(os.path.join("../pile", "test.jsonl.zst"))
    archive.add_data(data)
    archive.commit()
......