Commit 3d0282cf authored by baberabb

new_tasks have own tests now

parent a199314a
@@ -4,10 +4,11 @@ on:
  push:
  pull_request:
  workflow_dispatch:
# comment/edit out the above to stop/change the trigger
jobs:
  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
+    timeout-minutes: 120
    name: Scan for changed tasks
    steps:
      - name: checkout
@@ -17,8 +18,8 @@ jobs:
      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
-      # The `files_yaml` input optionally takes a yaml string to specify which files to check,
-      # and prepends the name to the standard output names.
+      # The `files_yaml` input optionally takes a yaml string to specify filters,
+      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
@@ -31,14 +32,15 @@ jobs:
            - lm_eval/api/**
          write_output_files: true
-      # This is a workaround to get the list of all modified files and save it to an env variable.
-      # The next two echo statements are just for logging.
+      # The next step is optional; the files are written to the workspace by default (above),
+      # so it's just for debugging.
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
@@ -53,7 +55,7 @@
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
-        # if new tasks are added, run tests on them; pytest file=test_new_tasks
+        # if new tasks are added, run tests on them
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
        # if api is modified, run tests on it
...
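For context, the wiring above can be approximated locally: `write_output_files: true` makes tj-actions/changed-files drop its outputs under `.github/outputs/`, and the test step only reads `tasks_all_changed_and_modified_files.txt` from there. A minimal sketch of that simulation, assuming you are in the repository root (the changed-file path below is hypothetical, and invoking pytest this way only approximates the CI step):

# simulate_ci_gate.py -- rough local stand-in for the workflow wiring above (sketch only)
import os
import subprocess
import sys

outputs_dir = ".github/outputs"
os.makedirs(outputs_dir, exist_ok=True)

# Pretend a single task config changed; this mirrors the file that
# tj-actions/changed-files writes when write_output_files is enabled.
with open(os.path.join(outputs_dir, "tasks_all_changed_and_modified_files.txt"), "w") as f:
    f.write("lm_eval/tasks/arc/arc_easy.yaml\n")  # hypothetical changed file, for illustration

# Same test entry point the workflow invokes (without -n=auto, to avoid the pytest-xdist dependency).
subprocess.run(
    [sys.executable, "-m", "pytest", "tests/extra/test_new_tasks.py", "-s", "-vv"],
    check=True,
)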
@@ -2,137 +2,126 @@ import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
-from typing import List, ClassVar
+from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
-# If tasks folder has changed then we get the list of files from FILENAME
-# and parse the yaml files to get the task names.
-# Or if API has changed then we set the ENV variable API to True
-# and run some given extended tasks
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if both not true just do arc_easy
    else:
        return ["arc_easy"]


-@pytest.fixture(params=new_tasks())
-def task_class(request) -> ConfigurableTask:
-    task_name = request.param
-    if task_name is None:
-        task_name = "arc_easy"
-    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name == task_name]
-    return x[0]
+def get_task_class() -> List[ConfigurableTask]:
+    task_name = new_tasks()
+    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
+    return x


-@pytest.fixture(params=new_tasks())
-def limit(request) -> int:
-    # not used; just for consistency
-    return 100
+@pytest.fixture()
+def limit() -> int:
+    return 10


# Tests
-def test_download(task_class: ConfigurableTask):
-    task_class().download()
-    assert task_class().dataset is not None
-
-def test_has_training_docs(task_class: ConfigurableTask):
-    assert task_class().has_training_docs() in [True, False]
-
-def test_check_training_docs(task_class: ConfigurableTask):
-    task = task_class()
-    assert task.has_training_docs() if task._config["training_split"] else True
-
-def test_has_validation_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
-
-def test_check_validation_docs(task_class):
-    task = task_class()
-    assert (
-        task_class().has_training_docs() if task._config["validation_split"] else True
-    )
-
-def test_has_test_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
-
-def test_check_test_docs(task_class):
-    task = task_class()
-    assert task_class().has_training_docs() if task._config["test_split"] else True
-
-def test_should_decontaminate(task_class):
-    task_class = task_class()
-    assert task_class.should_decontaminate() in [True, False]
-    if task_class.should_decontaminate():
-        assert task_class._config["doc_to_decontamination_query"] is not None
-
-def test_doc_to_text(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
-    )
-    _array = [task_class().doc_to_text(doc) for doc in arr]
-    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-    assert all(
-        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
-    )
-
-def test_create_choices(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
-    )
-    _array = [task_class().doc_to_choice(doc) for doc in arr]
-    # assert all(len(x) == 4 for x in _array)
-    assert all(isinstance(x, list) for x in _array)
-    assert all(isinstance(x[0], str) for x in _array)
-
-def test_doc_to_target(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_target())
-    )
-    _array_target = [task_class().doc_to_target(doc) for doc in arr]
-    assert all(isinstance(label, int) for label in _array_target)
-    assert len(_array_target) == limit if limit else True
-    # _array_text = [task.doc_to_text(doc) for doc in arr]
-    # Not working
-    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
-
-def test_build_all_requests(task_class, limit):
-    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
-    assert task_class.instances is not None
-
-def test_construct_requests(task_class, limit):
-    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
-    )
-    requests = [
-        task_class().construct_requests(doc, task_class().doc_to_text(doc))
-        for doc in arr
-    ]
-    assert all(isinstance(doc, list) for doc in requests)
-    assert len(requests) == limit if limit else True
+@pytest.mark.parametrize("task_class", get_task_class())
+class TestNewTasks:
+    def test_download(self, task_class: ConfigurableTask):
+        task_class().download()
+        assert task_class().dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class().has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        task = task_class()
+        if task.has_training_docs():
+            assert task._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class().has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        task = task_class()
+        if task.has_validation_docs():
+            assert task._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class().has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class()
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class()
+        assert task.should_decontaminate() in [True, False]
+        if task.should_decontaminate():
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(
+            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+            for x in _array
+        )
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_choice(doc) for doc in arr]
+        # assert all(len(x) == 4 for x in _array)
+        assert all(isinstance(x, list) for x in _array)
+        assert all(isinstance(x[0], str) for x in _array)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array_target = [task.doc_to_target(doc) for doc in arr]
+        assert all(isinstance(label, int) for label in _array_target)
+        assert len(_array_target) == limit if limit else True
+        # _array_text = [task.doc_to_text(doc) for doc in arr]
+        # Not working
+        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
+
+    def test_build_all_requests(self, task_class, limit):
+        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task_class.instances is not None
+
+    def test_construct_requests(self, task_class, limit):
+        task = task_class()
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        assert all(isinstance(doc, list) for doc in requests)
+        assert len(requests) == limit if limit else True
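The main structural change in this file is the move from per-function fixtures to a class-level `@pytest.mark.parametrize`: parametrizing the class applies the `task_class` argument to every test method, so each changed task runs the whole battery. A self-contained toy sketch of that pattern (the `Sample` class and the values here are invented for illustration, not taken from the repo):

import pytest


class Sample:
    """Stand-in for a task class; purely illustrative."""

    def __init__(self, name: str):
        self.name = name


@pytest.mark.parametrize("sample", [Sample("arc_easy"), Sample("hellaswag")])
class TestSamplePattern:
    # Every method in the class receives each parametrized `sample` in turn,
    # so two samples x two tests = four collected test cases.
    def test_has_name(self, sample):
        assert isinstance(sample.name, str)

    def test_name_is_not_empty(self, sample):
        assert len(sample.name) > 0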
@@ -2,14 +2,20 @@ import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
+import sys


# This is the path where the output for the changed files for the tasks folder is stored
-FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


-def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
+# reads a text file and returns a list of words
+# used to read the output of the changed txt from tj-actions/changed-files
+def load_changed_files(file_path: str) -> List[str]:
    with open(file_path, "r") as f:
-        return [line.strip() for line in f.readlines()]
+        content = f.read()
+        words_list = [x for x in content.split()]
+        sys.stdout.write(f"list of files: {words_list}")
+        return words_list


# checks the txt file for list of changed files.
...
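The reworked `load_changed_files` splits on any whitespace rather than on lines, which suits the space-separated list that tj-actions/changed-files writes by default. A small usage sketch; the import path and the `.yaml` filtering are assumptions (the real `parser` is not shown in this diff):

# Hypothetical local check of the helper; the module path assumes the file lives in tests/extra/.
from tests.extra.utilities_testing import load_changed_files

changed = load_changed_files(".github/outputs/tasks_all_changed_and_modified_files.txt")
# `parser` (not shown here) presumably narrows this list down to task configs;
# a naive approximation would be to keep only YAML files under lm_eval/tasks/.
task_configs = [p for p in changed if p.startswith("lm_eval/tasks/") and p.endswith(".yaml")]
print(task_configs)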
@@ -14,8 +14,8 @@ def task_class(task_name: List[str] = None) -> ConfigurableTask:
@pytest.fixture()
-def limit(any_new_tasks: bool) -> int:
-    return 100 if any_new_tasks else 10
+def limit() -> int:
+    return 10


# Tests
@@ -32,43 +32,45 @@ def test_has_training_docs(task_class: ConfigurableTask):
def test_check_training_docs(task_class: ConfigurableTask):
    task = task_class()
-    assert task.has_training_docs() if task._config["training_split"] else True
+    if task.has_training_docs():
+        assert task._config["training_split"] is not None


def test_has_validation_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_validation_docs() in [True, False]


def test_check_validation_docs(task_class):
    task = task_class()
-    assert (
-        task_class().has_training_docs() if task._config["validation_split"] else True
-    )
+    if task.has_validation_docs():
+        assert task._config["validation_split"] is not None


def test_has_test_docs(task_class):
-    assert task_class().has_training_docs() in [True, False]
+    assert task_class().has_test_docs() in [True, False]


def test_check_test_docs(task_class):
    task = task_class()
-    assert task_class().has_training_docs() if task._config["test_split"] else True
+    if task.has_test_docs():
+        assert task._config["test_split"] is not None


def test_should_decontaminate(task_class):
-    task_class = task_class()
-    assert task_class.should_decontaminate() in [True, False]
-    if task_class.should_decontaminate():
-        assert task_class._config["doc_to_decontamination_query"] is not None
+    task = task_class()
+    assert task.should_decontaminate() in [True, False]
+    if task.should_decontaminate():
+        assert task._config["doc_to_decontamination_query"] is not None


def test_doc_to_text(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array = [task_class().doc_to_text(doc) for doc in arr]
+    _array = [task.doc_to_text(doc) for doc in arr]
    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
@@ -76,24 +78,26 @@ def test_doc_to_text(task_class, limit):
def test_create_choices(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array = [task_class().doc_to_choice(doc) for doc in arr]
+    _array = [task.doc_to_choice(doc) for doc in arr]
    # assert all(len(x) == 4 for x in _array)
    assert all(isinstance(x, list) for x in _array)
    assert all(isinstance(x[0], str) for x in _array)


def test_doc_to_target(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_target())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    _array_target = [task_class().doc_to_target(doc) for doc in arr]
+    _array_target = [task.doc_to_target(doc) for doc in arr]
    assert all(isinstance(label, int) for label in _array_target)
    assert len(_array_target) == limit if limit else True
    # _array_text = [task.doc_to_text(doc) for doc in arr]
@@ -107,15 +111,13 @@ def test_build_all_requests(task_class, limit):
def test_construct_requests(task_class, limit):
+    task = task_class()
    arr = (
-        list(islice(task_class().test_docs(), limit))
-        if limit
-        else list(task_class().test_docs())
+        list(islice(task.test_docs(), limit))
+        if task.has_test_docs()
+        else list(islice(task.validation_docs(), limit))
    )
-    requests = [
-        task_class().construct_requests(doc, task_class().doc_to_text(doc))
-        for doc in arr
-    ]
+    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
    assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True
...
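A recurring idiom in both test files is the docs-selection fallback: prefer `test_docs()` when the task has a test split, otherwise fall back to `validation_docs()`, and cap the iteration with `islice` so large datasets are never fully materialized. A toy illustration of that idiom (the `FakeTask` class is invented for the example):

from itertools import islice


class FakeTask:
    """Minimal stand-in exposing the split-related methods the tests rely on."""

    def has_test_docs(self) -> bool:
        return False  # pretend this task only ships a validation split

    def test_docs(self):
        return iter([])

    def validation_docs(self):
        # a large, lazily generated stream of documents
        return ({"question": f"q{i}"} for i in range(10_000))


task, limit = FakeTask(), 10
docs = (
    list(islice(task.test_docs(), limit))
    if task.has_test_docs()
    else list(islice(task.validation_docs(), limit))
)
assert len(docs) == limit  # only `limit` docs are ever pulled from the generator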