Unverified commit e9d46380 authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #831 from baberabb/big-refactor_python_final

[Refactor] consolidated tasks tests
parents f86d6874 dfc47bd5
@@ -3,10 +3,10 @@ name: Tasks Modified
on:
  push:
    branches:
-      - big-refactor
+      - 'big-refactor*'
  pull_request:
    branches:
-      - big-refactor
+      - 'big-refactor*'
  workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
@@ -18,7 +18,7 @@ jobs:
      - name: checkout
        uses: actions/checkout@v3
        with:
-          fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
+          fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
      # Uses the tj-actions/changed-files@v37 action to check for changes.
      # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
@@ -51,6 +51,7 @@ jobs:
        with:
          python-version: 3.9
          cache: 'pip'
+          cache-dependency-path: setup.py
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
@@ -62,10 +63,10 @@ jobs:
      - name: Test with pytest
        # if new tasks are added, run tests on them
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
      # if api is modified, run tests on it
      - name: Test more tasks with pytest
        env:
          API: true
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
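For orientation, the test steps above rely on an earlier changed-files step writing the modified paths to .github/outputs/tasks_all_changed_and_modified_files.txt, which the test helpers later split on whitespace. A rough, hedged sketch of how that file is consumed on the Python side (the paths are invented for illustration; the real list comes from tj-actions/changed-files):

# Hypothetical contents of .github/outputs/tasks_all_changed_and_modified_files.txt:
#   lm_eval/tasks/arc/arc_easy.yaml lm_eval/tasks/hellaswag/hellaswag.yaml
# Splitting on whitespace means either space- or newline-separated lists work.
with open(".github/outputs/tasks_all_changed_and_modified_files.txt") as f:
    changed_files = f.read().split()
print(changed_files)
# ['lm_eval/tasks/arc/arc_easy.yaml', 'lm_eval/tasks/hellaswag/hellaswag.yaml']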
@@ -26,7 +26,8 @@ jobs:
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
-          cache: 'pip'
+          cache: pip
+          cache-dependency-path: setup.py
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      - name: Pre-Commit
@@ -46,22 +47,32 @@ jobs:
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
-    timeout-minutes: 20
+    strategy:
+      matrix:
+        python-version: [ "3.9", "3.10", "3.11" ]
+    timeout-minutes: 30
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
-      - name: Set up Python 3.9
+      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
-          python-version: 3.9
-          cache: 'pip'
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+          cache-dependency-path: setup.py
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
+      - name: Archive artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: output_results
+          path: |
+            test_logs/*
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If tasks folder has changed then we get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or if API has changed then we set the ENV variable API to True
        # and run given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if both not true just do arc_easy
    else:
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [l for line in f.readlines() for l in line.strip().split(" ")]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
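As a quick usage sketch, this is roughly how the two helpers chain together in CI; the file paths and resulting task names below are illustrative, not taken from an actual run:

# Hypothetical example: turn the changed-files output into task names.
changed = load_changed_files(".github/outputs/tasks_all_changed_and_modified_files.txt")
# e.g. changed == ["lm_eval/tasks/arc/arc_easy.yaml", "lm_eval/tasks/hellaswag/utils.py"]
task_names = parser(changed)
# A .yaml path contributes its own "task" entry; a .py path pulls in every sibling
# *.yaml in the same directory, so a utils.py change re-tests that whole task folder.
# e.g. task_names == ["arc_easy", "hellaswag"]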
from __future__ import annotations
import pytest
+from pathlib import Path
import numpy as np
from lm_eval.models.huggingface import HFLM
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
+import sys
+import torch


class Test_HFLM:
+    torch.use_deterministic_algorithms(True)
+    version_minor = sys.version_info.minor
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
@@ -90,8 +94,15 @@ class Test_HFLM:
    def test_logliklihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
-        # change atol in case of consistent failure
-        assert np.allclose(_res, _RES, atol=1e-4)
+        # log samples to CI
+        dir_path = Path("test_logs")
+        dir_path.mkdir(parents=True, exist_ok=True)
+        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
+        file_path = file_path.resolve()
+        with open(file_path, "w") as f:
+            f.write("\n".join(str(x) for x in _res))
+        assert np.allclose(_res, _RES, atol=1e-2)
        # check indices for Multiple Choice
        argmax_RES, argmax_res = np.argmax(
            np.array(_RES).reshape(-1, 4), axis=1
...
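For context, the per-version log written above is what the "Archive artifacts" step uploads from test_logs/. A small, purely illustrative sketch of inspecting such a log after a CI run (the reference file name is hypothetical; only the log path pattern and the 1e-2 tolerance come from the test itself):

# Illustrative only: compare a downloaded log against a saved reference.
import numpy as np

with open("test_logs/outputs_log_11.txt") as f:  # e.g. the Python 3.11 matrix job
    observed = np.array([float(line) for line in f if line.strip()])

with open("reference.txt") as f:  # hypothetical reference values
    expected = np.array([float(line) for line in f if line.strip()])

# The test relaxes atol to 1e-2 across Python versions; reuse the same tolerance here.
print(np.allclose(observed, expected, atol=1e-2))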
from itertools import islice
import pytest
-from typing import List
+from .utils import new_tasks
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask

-# Using fixtures to get the task class and limit
-@pytest.fixture()
-def task_class() -> ConfigurableTask:
-    task_name = ["arc_easy"]
-    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
-    return x[0]
+# Default Task
+TASKS = ["arc_easy"]
+
+
+def task_class():
+    global TASKS
+    # CI: new_tasks checks if any modifications have been made
+    task_classes = new_tasks()
+    # Check if task_classes is empty
+    if task_classes:
+        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
+    else:
+        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]

@pytest.fixture()
@@ -18,109 +26,96 @@ def limit() -> int:
# Tests
-def test_download(task_class: ConfigurableTask):
-    task_class().download()
-    assert task_class().dataset is not None
-
-def test_has_training_docs(task_class: ConfigurableTask):
-    assert task_class().has_training_docs() in [True, False]
-
-def test_check_training_docs(task_class: ConfigurableTask):
-    task = task_class()
-    if task.has_training_docs():
-        assert task._config["training_split"] is not None
-
-def test_has_validation_docs(task_class):
-    assert task_class().has_validation_docs() in [True, False]
-
-def test_check_validation_docs(task_class):
-    task = task_class()
-    if task.has_validation_docs():
-        assert task._config["validation_split"] is not None
-
-def test_has_test_docs(task_class):
-    assert task_class().has_test_docs() in [True, False]
-
-def test_check_test_docs(task_class):
-    task = task_class()
-    if task.has_test_docs():
-        assert task._config["test_split"] is not None
-
-def test_should_decontaminate(task_class):
-    task = task_class()
-    assert task.should_decontaminate() in [True, False]
-    if task.should_decontaminate():
-        assert task._config["doc_to_decontamination_query"] is not None
-
-def test_doc_to_text(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array = [task.doc_to_text(doc) for doc in arr]
-    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-    assert all(
-        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
-    )
-
-def test_create_choices(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    if "multiple_choice" in task._config.output_type:
-        _array = [task.doc_to_choice(doc) for doc in arr]
-        # assert all(len(x) == 4 for x in _array)
-        assert all(isinstance(x, list) for x in _array)
-        assert all(isinstance(x[0], str) for x in _array)
-
-def test_doc_to_target(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    _array_target = [task.doc_to_target(doc) for doc in arr]
-    if task._config.output_type == "multiple_choice":
-        assert all(isinstance(label, int) for label in _array_target)
-    # _array_text = [task.doc_to_text(doc) for doc in arr]
-    # Not working
-    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
-
-def test_build_all_requests(task_class, limit):
-    task_class().build_all_requests(rank=1, limit=limit, world_size=1)
-    assert task_class.instances is not None
-
-# ToDO: Add proper testing
-def test_construct_requests(task_class, limit):
-    task = task_class()
-    arr = (
-        list(islice(task.test_docs(), limit))
-        if task.has_test_docs()
-        else list(islice(task.validation_docs(), limit))
-    )
-    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    # assert all(isinstance(doc, list) for doc in requests)
-    assert len(requests) == limit if limit else True
+@pytest.mark.parametrize("task_class", task_class())
+class TestNewTasks:
+    def test_download(self, task_class: ConfigurableTask):
+        task_class.download()
+        assert task_class.dataset is not None
+
+    def test_has_training_docs(self, task_class: ConfigurableTask):
+        assert task_class.has_training_docs() in [True, False]
+
+    def test_check_training_docs(self, task_class: ConfigurableTask):
+        if task_class.has_training_docs():
+            assert task_class._config["training_split"] is not None
+
+    def test_has_validation_docs(self, task_class):
+        assert task_class.has_validation_docs() in [True, False]
+
+    def test_check_validation_docs(self, task_class):
+        if task_class.has_validation_docs():
+            assert task_class._config["validation_split"] is not None
+
+    def test_has_test_docs(self, task_class):
+        assert task_class.has_test_docs() in [True, False]
+
+    def test_check_test_docs(self, task_class):
+        task = task_class
+        if task.has_test_docs():
+            assert task._config["test_split"] is not None
+
+    def test_should_decontaminate(self, task_class):
+        task = task_class
+        assert task.should_decontaminate() in [True, False]
+        if task.should_decontaminate():
+            assert task._config["doc_to_decontamination_query"] is not None
+
+    def test_doc_to_text(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array = [task.doc_to_text(doc) for doc in arr]
+        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
+        assert all(
+            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
+            for x in _array
+        )
+
+    def test_create_choices(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        if "multiple_choice" in task._config.output_type:
+            _array = [task.doc_to_choice(doc) for doc in arr]
+            # assert all(len(x) == 4 for x in _array)
+            assert all(isinstance(x, list) for x in _array)
+            assert all(isinstance(x[0], str) for x in _array)
+
+    def test_doc_to_target(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        _array_target = [task.doc_to_target(doc) for doc in arr]
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
+        # _array_text = [task.doc_to_text(doc) for doc in arr]
+        # Not working
+        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
+
+    def test_build_all_requests(self, task_class, limit):
+        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
+        assert task_class.instances is not None
+
+    # ToDO: Add proper testing
+    def test_construct_requests(self, task_class, limit):
+        task = task_class
+        arr = (
+            list(islice(task.test_docs(), limit))
+            if task.has_test_docs()
+            else list(islice(task.validation_docs(), limit))
+        )
+        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # assert all(isinstance(doc, list) for doc in requests)
+        assert len(requests) == limit if limit else True

# def test_create_choices(task_class):
...
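A note on the pattern above: because task_class() is called at collection time inside @pytest.mark.parametrize, pytest generates one copy of every TestNewTasks method per task instance. A minimal, self-contained sketch of the same mechanism with a toy class (not the harness's real ConfigurableTask):

import pytest


class ToyTask:
    """Stand-in for a ConfigurableTask; purely illustrative."""

    def __init__(self, name: str):
        self.name = name

    def has_test_docs(self) -> bool:
        return True


def toy_task_instances():
    # Mirrors task_class() above: evaluated once, when pytest collects the module.
    return [ToyTask("arc_easy"), ToyTask("hellaswag")]


@pytest.mark.parametrize("task", toy_task_instances(), ids=lambda t: t.name)
class TestToyTasks:
    def test_has_test_docs(self, task):
        # Runs once per ToyTask instance, e.g. test_has_test_docs[arc_easy].
        assert task.has_test_docs() in [True, False]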
-import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
-import sys
+from typing import Union
+import os
+
+# {{{CI}}}
# This is the path where the output for the changed files for the tasks folder is stored
# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

# reads a text file and returns a list of words
# used to read the output of the changed txt from tj-actions/changed-files
def load_changed_files(file_path: str) -> List[str]:
    with open(file_path, "r") as f:
        content = f.read()
        words_list = [x for x in content.split()]
-        sys.stdout.write(f"list of files: {words_list}")
        return words_list
@@ -30,3 +30,18 @@ def parser(full_path: List[str]) -> List[str]:
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
+
+
+def new_tasks() -> Union[list[str], None]:
+    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
+        return parser(load_changed_files(FILENAME))
+    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
+        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+    # if both not true just do arc_easy
+    else:
+        return
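To make the control flow above concrete, here is a small illustration of the three branches of new_tasks(); the environment manipulation is only for demonstration and is not part of the test suite:

# Illustrative only: exercise the three branches of new_tasks().
import os

# 1) Changed-files output exists (as it does in CI) -> the changed paths are
#    parsed into task names via load_changed_files() + parser().

# 2) No output file, but the API env var is set (the "Test more tasks" CI step)
#    -> a fixed smoke-test list is returned.
os.environ["API"] = "true"
print(new_tasks())  # ['arc_easy', 'hellaswag', 'piqa', 'wikitext']

# 3) Neither condition holds -> None is returned, and callers such as
#    task_class() in tests/test_tasks.py fall back to TASKS = ["arc_easy"].
os.environ.pop("API", None)
print(new_tasks())  # None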