Commit 020ff063 authored by haileyschoelkopf

Merge branch 'big-refactor' into cleanup

parents dad7e977 21882abe
name: Tasks Modified
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  changed_files:
    runs-on: ubuntu-latest # windows-latest || macos-latest
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
      # Example 1
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          # Record where the changed-files list was written ($GITHUB_ENV expects NAME=value pairs;
          # TASKS_LIST_FILE is an illustrative variable name)
          echo "TASKS_LIST_FILE=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV"
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
          cache: 'pip'
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
      - name: Test more tasks with pytest
        env:
          API: true
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
        run: python -m pytest tests/test_api.py -s -vv -n=auto --new_task
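With write_output_files: true, the changed-files step is expected to write its per-group lists under .github/outputs/; the test helpers introduced in this commit (tests/extra/test_utils.py, shown further down) read that file back. A minimal sketch of that consumption, assuming one path per line:

from pathlib import Path

OUTPUTS = Path(".github/outputs/tasks_all_changed_and_modified_files.txt")

if OUTPUTS.exists():
    # one changed file path per line, blank lines skipped
    changed_paths = [line.strip() for line in OUTPUTS.read_text().splitlines() if line.strip()]
    print(f"{len(changed_paths)} changed task/api file(s):", changed_paths)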
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Build
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Cache
        uses: actions/cache@v2.1.3
        with:
          # A list of files, directories, and wildcard patterns to cache and restore
          path: |
            ~/.cache
          # An explicit key for restoring and saving the cache
          key: evaldata-cache-4
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pytest-cov
          pip install -e .[dev,multilingual]
          # Install optional git dependencies
          pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest -vv --cov=lm_eval/ tests/
      - name: Upload to codecov
        run: |
          bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Unit Tests
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      - name: Lint with pylint
        run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string='    ' **/*.py
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Lint with mypy
        run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
@@ -6,7 +6,7 @@ from lm_eval.api.registry import register_model
 @register_model("dummy")
 class DummyLM(LM):
     def __init__(self):
-        pass
+        super().__init__()

     @classmethod
     def create_from_arg_string(cls, arg_string, additional_config=None):
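The only change here is that DummyLM.__init__ now calls the base-class initializer instead of pass. A minimal sketch (not the library's actual LM base class) of why subclasses do this:

class BaseLM:
    def __init__(self):
        # hypothetical shared state that every LM subclass is expected to carry
        self.cache_hook = None


class DummyLM(BaseLM):
    def __init__(self):
        super().__init__()  # without this call, the shared attribute would never be set


lm = DummyLM()
assert hasattr(lm, "cache_hook")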
@@ -115,8 +115,8 @@ def main():
     if args.output_path:
         path = Path(args.output_path)
-        # check if file or 'dir/results.jsonl' exists
-        if path.is_file() or Path(args.output_path).joinpath("results.jsonl").is_file():
+        # check if file or 'dir/results.json' exists
+        if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
             eval_logger.warning(
                 f"File already exists at {path}. Results will be overwritten."
             )
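For context, the condition being updated reduces to the following check; a small sketch of the same logic, assuming args.output_path may name either a file or a directory that will receive results.json:

from pathlib import Path


def output_already_exists(output_path: str) -> bool:
    # True if the target file exists, or a results.json already sits inside the target directory
    path = Path(output_path)
    return path.is_file() or path.joinpath("results.json").is_file()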
@@ -50,6 +50,13 @@ setuptools.setup(
     ],
     extras_require={
         "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
+        "linting": [
+            "flake8",
+            "pylint",
+            "mypy",
+            "pre-commit",
+        ],
+        "testing": ["pytest", "pytest-cov", "pytest-xdist"],
         "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
         "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
         "promptsource": [
def pytest_addoption(parser):
    parser.addoption(
        "--new_task",
        action="store_true",
        help="run tests only against newly added or modified tasks",
    )
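For reference, tests and fixtures can read the flag registered above through pytest's standard request.config.getoption API; a minimal sketch (the fixture name new_task_flag is illustrative):

import pytest


@pytest.fixture()
def new_task_flag(request) -> bool:
    # True when the suite is invoked as: pytest --new_task
    return request.config.getoption("--new_task")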
from pathlib import Path
from typing import List

from lm_eval.utils import load_yaml_config

FILE_PATH = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [line.strip() for line in f.readlines()]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            # a changed .py file pulls in every task defined by the YAML configs next to it
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
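In CI the input file is produced by the changed-files step of the Tasks Modified workflow; the helpers can also be exercised directly. A minimal usage sketch, assuming a local repository checkout (the changed paths below are hypothetical examples):

from tests.extra.test_utils import load_changed_files, parser

# Hypothetical changed paths, as they would appear one per line in the CI-generated file:
changed = [
    "lm_eval/tasks/arc/arc_easy.yaml",   # a YAML config contributes its own "task" name
    "lm_eval/tasks/hellaswag/utils.py",  # a .py file contributes every sibling YAML's "task" name
]
print(parser(changed))  # e.g. ["arc_easy", "hellaswag"]; order varies because a set is used

# In CI, the list of changed paths comes from the generated file instead:
# task_names = parser(load_changed_files())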
@@ -5,7 +5,7 @@ import lm_eval.api.registry as registry
 import lm_eval.tasks as tasks
 # import lm_eval.models as models
+import lm_eval.api as api
 import lm_eval.evaluator as evaluator
 import random
 import pytest
@@ -15,60 +15,52 @@ import pytest
 # test once we break evaluator into smaller, more manageable pieces
-@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
-def test_evaluator(taskname, task_class):
-    task_dict = tasks.get_task_dict([taskname])
-
-    # TODO: re-add cachingLM
-    # os.system("rm test_cache.db")
-    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
-    lm = registry.get_model("dummy")()
-
-    def ll_fn(reqs):
-        for ctx, cont in reqs:
-            if len(ctx) == 0:
-                continue
-            # space convention
-            assert ctx[-1] != " "
-            assert cont[0] == " " or ctx[-1] == "\n"
-
-        res = []
-        random.seed(42)
-        for _ in reqs:
-            res.append((-random.random(), False))
-        return res
-
-    def ll_perp_fn(reqs):
-        for (string,) in reqs:
-            assert isinstance(string, str)
-
-        res = []
-        random.seed(42)
-        for _ in reqs:
-            res.append(-random.random())
-        return res
-
-    lm.loglikelihood = ll_fn
-    lm.loglikelihood_rolling = ll_perp_fn
-
+@pytest.mark.parametrize(
+    "task_name,limit,model,model_args",
+    [
+        (
+            ["arc_easy"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+        )
+    ],
+)
+def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
     limit = 10
-    e1 = evaluator.evaluate(
-        lm=lm,
-        task_dict=task_dict,
-        num_fewshot=0,
-        limit=limit,
-        bootstrap_iters=10,
-    )
+
+    e1 = evaluator.simple_evaluate(
+        model=model,
+        tasks=task_name,
+        limit=limit,
+        model_args=model_args,
+    )
+    assert e1 is not None
+
+    lm = api.registry.get_model(model).create_from_arg_string(
+        model_args,
+        {
+            "batch_size": None,
+            "max_batch_size": None,
+            "device": None,
+        },
+    )
+    task_dict = tasks.get_task_dict(task_name, num_fewshot=0)

     e2 = evaluator.evaluate(
         lm=lm,
         task_dict=task_dict,
         limit=limit,
     )
+    assert e2 is not None

     # check that caching is working
-    assert e1 == e2
+    def r(x):
+        return x["results"]["arc_easy"]
+
+    assert all(
+        x == y
+        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
+    )
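The closing assertion checks that the one-call simple_evaluate path and the manual get_model / get_task_dict / evaluate path report the same numbers. A minimal sketch of that consistency check, assuming both results dicts have the shape {"results": {"arc_easy": {metric: value, ...}}}:

def results_match(e1: dict, e2: dict, task: str = "arc_easy") -> bool:
    """True when both runs report identical metric values for the given task."""
    r1, r2 = e1["results"][task], e2["results"][task]
    return list(r1.values()) == list(r2.values())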
import os
from itertools import islice
from typing import List

import pytest

import lm_eval.tasks as tasks
from tests.extra.test_utils import load_changed_files, parser


@pytest.fixture()
def any_new_tasks(request) -> bool:
    return request.config.getoption("--new_task")


# ["arc_easy"], else get the list of newly added or modified tasks
def new_tasks(any_new_tasks: bool) -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if any_new_tasks and os.path.exists(FILENAME):
        # changed-files list written by CI -> task names
        return [parser(load_changed_files(FILENAME))]
    elif os.getenv("API") is not None:
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    else:
        return ["arc_easy"]


@pytest.fixture(params=new_tasks(any_new_tasks))
def task_class(request):
    task_name = request.param
    return [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name][0]


@pytest.fixture()
def limit(any_new_tasks: bool) -> int:
    return 100 if any_new_tasks else 10


# Tests


def test_download(task_class):
    task_class().download()
    assert task_class().dataset is not None


def test_has_training_docs(task_class):
    assert task_class().has_training_docs() in [True, False]


def test_check_training_docs(task_class):
    task = task_class()
    assert task.has_training_docs() if task._config["training_split"] else True


def test_has_validation_docs(task_class):
    assert task_class().has_validation_docs() in [True, False]


def test_check_validation_docs(task_class):
    task = task_class()
    assert task.has_validation_docs() if task._config["validation_split"] else True


def test_has_test_docs(task_class):
    assert task_class().has_test_docs() in [True, False]


def test_check_test_docs(task_class):
    task = task_class()
    assert task.has_test_docs() if task._config["test_split"] else True


def test_should_decontaminate(task_class):
    task = task_class()
    assert task.should_decontaminate() in [True, False]
    if task.should_decontaminate():
        assert task._config["doc_to_decontamination_query"] is not None


def test_doc_to_text(task_class, limit):
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    _array = [task_class().doc_to_text(doc) for doc in arr]
    # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
    assert all(
        isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
    )


def test_create_choices(task_class, limit):
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    _array = [task_class().doc_to_choice(doc) for doc in arr]
    # assert all(len(x) == 4 for x in _array)
    assert all(isinstance(x, list) for x in _array)
    assert all(isinstance(x[0], str) for x in _array)


def test_doc_to_target(task_class, limit):
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    _array_target = [task_class().doc_to_target(doc) for doc in arr]
    assert all(isinstance(label, int) for label in _array_target)
    assert len(_array_target) == limit if limit else True
    # _array_text = [task.doc_to_text(doc) for doc in arr]
    # Not working
    # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))


def test_build_all_requests(task_class, limit):
    task = task_class()
    task.build_all_requests(rank=1, limit=limit, world_size=1)
    assert task.instances is not None


def test_construct_requests(task_class, limit):
    arr = (
        list(islice(task_class().test_docs(), limit))
        if limit
        else list(task_class().test_docs())
    )
    requests = [
        task_class().construct_requests(doc, task_class().doc_to_text(doc))
        for doc in arr
    ]
    assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True
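For readers unfamiliar with the fixture indirection above: task_class resolves each selected task name to its registered class through tasks.TASK_REGISTRY. A minimal, self-contained illustration of that lookup pattern, using a hypothetical stand-in registry (the commented-out legacy tests follow below):

class FakeArcEasy:
    """Hypothetical stand-in for a registered task class."""


FAKE_TASK_REGISTRY = {"arc_easy": FakeArcEasy, "hellaswag": dict}  # stand-in for tasks.TASK_REGISTRY


def resolve(task_names):
    # same pattern as the fixture: first registered class whose name appears in the list
    return [cls for name, cls in FAKE_TASK_REGISTRY.items() if name in task_names][0]


assert resolve(["arc_easy"]) is FakeArcEasy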
# def test_create_choices(task_class):
# arr = list(islice(task_class().test_docs(), 1))
# choices = task_class().create_choices(arr[0])
# assert choices is not None
# checking if number of choices is correct
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_basic_interface(taskname, task_class):
# print("Evaluating task", taskname)
# task = task_class()
#
# assert task.has_training_docs() in [True, False]
# assert task.has_validation_docs() in [True, False]
# assert task.has_test_docs() in [True, False]
#
# assert isinstance(task.aggregation(), dict)
# assert isinstance(task.higher_is_better(), dict)
# assert task.aggregation().keys() == task.higher_is_better().keys()
#
# for v in task.higher_is_better().values():
# assert v in [True, False]
#
# assert isinstance(task.VERSION, int)
#
# # test deterministic docs
# # (don't test train because it's slow)
#
# task2 = task_class()
#
# limit = None
#
# if taskname in ["triviaqa"] or taskname.startswith("pile_"):
# limit = 10000
# if task.has_validation_docs():
# arr = list(islice(task.validation_docs(), limit))
# arr2 = list(islice(task2.validation_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
# if task.has_test_docs():
# arr = list(islice(task.test_docs(), limit))
# arr2 = list(islice(task2.test_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
# if task.has_training_docs():
# arr = list(islice(task.training_docs(), limit))
# arr2 = list(islice(task2.training_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
#
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_documents_and_requests(taskname, task_class):
# print("Evaluating task", taskname)
# task = task_class()
# fns = []
# if task.has_training_docs():
# fns.append(task.training_docs)
# if task.has_validation_docs():
# fns.append(task.validation_docs)
# # test doc might not have labels
# # if task.has_test_docs(): fns.append(task.test_docs)
#
# for fn in fns:
# # print(list(islice(fn(), 10)))
# for doc in islice(fn(), 10):
#
# txt = task.doc_to_text(doc)
# tgt = task.doc_to_target(doc)
#
# assert isinstance(txt, str)
# assert isinstance(tgt, str)
#
# # space convention
# # allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
# if len(txt) != 0:
# assert txt[-1] != " "
# assert tgt[0] == " " or txt[-1] == "\n"
#
# reqs = task.construct_requests(doc, txt)
#
# # construct_requests can return just one request
# if not isinstance(reqs, (list, tuple)):
# reqs = [reqs]
#
# # todo: mock lm after refactoring evaluator.py to not be a mess
# # for req in reqs:
# # assert isinstance(req, base.Request)
@@ -6,7 +6,7 @@ import lm_eval.models
 def test_description():
     seed = 42
     num_examples = 1
-    task_names = ["arc_challenge", "lambada"]
+    task_names = ["arc_challenge", "arc_easy"]
     description_dict = {
         "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
         "lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
@@ -40,6 +40,5 @@ def test_description():
         ctx = task.fewshot_context(
             doc=doc,
             num_fewshot=1,
-            rnd=rnd,
         )
         assert description in ctx
@@ -44,9 +44,9 @@ def test_generate_13_grams_1(caplog):
         pass
     os.makedirs(test_working_directory)
-    assert not os.path.exists("pile")
-    os.makedirs("pile")
-    archive = Archive(os.path.join("pile", "test.jsonl.zst"))
+    assert not os.path.exists("../pile")
+    os.makedirs("../pile")
+    archive = Archive(os.path.join("../pile", "test.jsonl.zst"))
     archive.add_data(data)
     archive.commit()