"examples/offline_inference/structured_outputs.py" did not exist on "7b55581a62bc88a66338fc44d1fcf5ade218ed43"
Commit b7cd829b authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into benchmark-scripts
parents 2d96a8c8 4e44f0aa
......@@ -132,8 +132,8 @@ def main():
if args.output_path:
path = Path(args.output_path)
- # check if file or 'dir/results.jsonl' exists
- if path.is_file() or Path(args.output_path).joinpath("results.jsonl").is_file():
+ # check if file or 'dir/results.json' exists
+ if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
eval_logger.warning(
f"File already exists at {path}. Results will be overwritten."
)
......
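The hunk above switches the overwrite check from results.jsonl to results.json: --output_path may point either at a results file directly or at a directory expected to contain results.json. A minimal sketch of that check, using a hypothetical helper name and example paths not taken from the diff:

# Sketch only; `results_already_exist` and the paths below are illustrative.
from pathlib import Path

def results_already_exist(output_path: str) -> bool:
    path = Path(output_path)
    # either a results file was passed directly, or a directory that already
    # holds a results.json from a previous run
    return path.is_file() or path.joinpath("results.json").is_file()

print(results_already_exist("my_runs"))               # True if my_runs/results.json exists
print(results_already_exist("my_runs/results.json"))  # True if that file exists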
......@@ -18,6 +18,9 @@ setuptools.setup(
"lm_eval": ["**/*.yaml"],
"examples": ["**/*.yaml"],
},
+ entry_points={
+     "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
+ },
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
......@@ -50,6 +53,13 @@ setuptools.setup(
],
extras_require={
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"linting": [
"flake8",
"pylint",
"mypy",
"pre-commit",
],
"testing": ["pytest", "pytest-cov", "pytest-xdist"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
"promptsource": [
......
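The setup.py additions do two things: they register lm-eval and lm_eval console scripts, both resolving to main:main, and they add linting and testing extras, installable with the standard pip extras syntax such as pip install -e ".[linting,testing]". A minimal sketch of what the entry point resolves to, assuming the package is installed:

# Sketch only: running `lm-eval` (or `lm_eval`) from a shell is equivalent to
# importing the module and attribute named in "lm-eval = main:main".
from main import main  # top-level main.py of the repository

if __name__ == "__main__":
    main()  # parses its own CLI arguments, including the output path handled in the hunk above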
import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os
# GitHub CI
def new_tasks() -> List[str]:
FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
if os.path.exists(FILENAME):
# If the tasks folder has changed, read the list of changed files from FILENAME
# and parse the YAML files to get the task names.
return parser(load_changed_files(FILENAME))
elif os.getenv("API") is not None:
# Otherwise, if the API has changed, the CI workflow sets the API env variable
# and we run this fixed set of tasks.
return ["arc_easy", "hellaswag", "piqa", "wikitext"]
# If neither condition holds, just run arc_easy.
else:
return ["arc_easy"]
def get_task_class() -> List[ConfigurableTask]:
task_name = new_tasks()
x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
return x
@pytest.fixture()
def limit() -> int:
return 10
# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
def test_download(self, task_class: ConfigurableTask):
task_class().download()
assert task_class().dataset is not None
def test_has_training_docs(self, task_class: ConfigurableTask):
assert task_class().has_training_docs() in [True, False]
def test_check_training_docs(self, task_class: ConfigurableTask):
task = task_class()
if task.has_training_docs():
assert task._config["training_split"] is not None
def test_has_validation_docs(self, task_class):
assert task_class().has_validation_docs() in [True, False]
def test_check_validation_docs(self, task_class):
task = task_class()
if task.has_validation_docs():
assert task._config["validation_split"] is not None
def test_has_test_docs(self, task_class):
assert task_class().has_test_docs() in [True, False]
def test_check_test_docs(self, task_class):
task = task_class()
if task.has_test_docs():
assert task._config["test_split"] is not None
def test_should_decontaminate(self, task_class):
task = task_class()
assert task.should_decontaminate() in [True, False]
if task.should_decontaminate():
assert task._config["doc_to_decontamination_query"] is not None
def test_doc_to_text(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array = [task.doc_to_text(doc) for doc in arr]
# space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
assert all(
isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
for x in _array
)
def test_create_choices(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
if "multiple_choice" in task._config.group:
_array = [task.doc_to_choice(doc) for doc in arr]
# assert all(len(x) == 4 for x in _array)
assert all(isinstance(x, list) for x in _array)
assert all(isinstance(x[0], str) for x in _array)
def test_doc_to_target(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array_target = [task.doc_to_target(doc) for doc in arr]
assert all(isinstance(label, int) for label in _array_target)
assert len(_array_target) == limit if limit else True
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
def test_build_all_requests(self, task_class, limit):
task_class().build_all_requests(rank=1, limit=limit, world_size=1)
assert task_class.instances is not None
def test_construct_requests(self, task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
assert all(isinstance(doc, list) for doc in requests)
assert len(requests) == limit if limit else True
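The new test module above gates which tasks get exercised on CI: the changed-files output drives the selection, the API env variable falls back to a small fixed set, and arc_easy is the default. A standalone sketch of what a single parametrized TestNewTasks case does, using the same registry lookup as get_task_class() (assumes the harness and the task's dataset dependencies are installed):

# Sketch of one parametrized case, outside pytest.
import lm_eval.tasks as tasks

task_cls = tasks.TASK_REGISTRY["arc_easy"]  # same lookup get_task_class() performs
task = task_cls()

task.download()                                     # test_download
assert task.dataset is not None
assert task.has_training_docs() in [True, False]    # test_has_training_docs
if task.has_training_docs():                        # test_check_training_docs
    assert task._config["training_split"] is not None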
import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path
import sys
# This is the path where the output for the changed files for the tasks folder is stored
# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
# reads a text file and returns a list of words
# used to read the output of the changed txt from tj-actions/changed-files
def load_changed_files(file_path: str) -> List[str]:
with open(file_path, "r") as f:
content = f.read()
words_list = [x for x in content.split()]
sys.stdout.write(f"list of files: {words_list}")
return words_list
# checks the txt file for list of changed files.
# if file ends with .yaml then check yaml for task name
# if file ends with .py then parse the folder for all yaml files
def parser(full_path: List[str]) -> List[str]:
_output = set()
for x in full_path:
if x.endswith(".yaml"):
_output.add(load_yaml_config(x)["task"])
elif x.endswith(".py"):
path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
_output |= {load_yaml_config(x)["task"] for x in path}
return list(_output)
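This utilities module is the glue between the tj-actions/changed-files output and the task tests: load_changed_files reads the space-separated file list, and parser resolves each changed path to task names (directly for a YAML, via every sibling YAML for a .py file). A hedged usage sketch with hypothetical file contents:

# Illustration only: suppose the changed-files output contained
#   lm_eval/tasks/arc/arc_easy.yaml lm_eval/tasks/hellaswag/utils.py
# parser() would then yield the task of the single YAML plus the tasks of
# every YAML sitting next to the changed .py file.
if __name__ == "__main__":
    changed = load_changed_files(
        ".github/outputs/tasks_all_changed_and_modified_files.txt"
    )
    print(parser(changed))  # e.g. ['arc_easy', 'hellaswag', ...]; order is unstable (built from a set)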
......@@ -5,7 +5,7 @@ import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import random
import pytest
......@@ -15,60 +15,52 @@ import pytest
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, task_class):
task_dict = tasks.get_task_dict([taskname])
# TODO: re-add cachingLM
# os.system("rm test_cache.db")
# lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
lm = registry.get_model("dummy")()
def ll_fn(reqs):
for ctx, cont in reqs:
if len(ctx) == 0:
continue
# space convention
assert ctx[-1] != " "
assert cont[0] == " " or ctx[-1] == "\n"
res = []
random.seed(42)
for _ in reqs:
res.append((-random.random(), False))
return res
def ll_perp_fn(reqs):
for (string,) in reqs:
assert isinstance(string, str)
res = []
random.seed(42)
for _ in reqs:
res.append(-random.random())
return res
lm.loglikelihood = ll_fn
lm.loglikelihood_rolling = ll_perp_fn
@pytest.mark.parametrize(
"task_name,limit,model,model_args",
[
(
["arc_easy"],
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
)
],
)
def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
task_name = task_name
limit = 10
e1 = evaluator.evaluate(
lm=lm,
task_dict=task_dict,
num_fewshot=0,
e1 = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
bootstrap_iters=10,
model_args=model_args,
)
assert e1 is not None
lm = api.registry.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": None,
"max_batch_size": None,
"device": None,
},
)
task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
e2 = evaluator.evaluate(
lm=lm,
task_dict=task_dict,
num_fewshot=0,
limit=limit,
bootstrap_iters=10,
)
assert e2 is not None
# check that caching is working
assert e1 == e2
def r(x):
return x["results"]["arc_easy"]
assert all(
x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
)
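The rewritten evaluator test replaces the dummy-LM unit test with an end-to-end check: simple_evaluate on a small HF model must agree with building the model through the registry and calling evaluate on the task dict directly. A standalone sketch of that equivalence, following the keyword arguments shown in the diff (it downloads EleutherAI/pythia-160m, so it is not a quick unit test, and the per-metric comparison of the actual test is simplified to a dict comparison here):

# Standalone version of the comparison the updated test performs.
import lm_eval.api as api
import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks

MODEL_ARGS = "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu"

# high-level entry point
e1 = evaluator.simple_evaluate(
    model="hf", model_args=MODEL_ARGS, tasks=["arc_easy"], limit=10
)

# same evaluation, assembled by hand
lm = api.registry.get_model("hf").create_from_arg_string(
    MODEL_ARGS, {"batch_size": None, "max_batch_size": None, "device": None}
)
task_dict = tasks.get_task_dict(["arc_easy"], num_fewshot=0)
e2 = evaluator.evaluate(lm=lm, task_dict=task_dict, limit=10)

assert e1["results"]["arc_easy"] == e2["results"]["arc_easy"]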
import lm_eval.tasks as tasks
import pytest
from itertools import islice
import pytest
from typing import List
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
# Using fixtures to get the task class and limit
@pytest.fixture()
def task_class() -> ConfigurableTask:
task_name = ["arc_easy"]
x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
return x[0]
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, task_class):
print("Evaluating task", taskname)
task = task_class()
assert task.has_training_docs() in [True, False]
assert task.has_validation_docs() in [True, False]
assert task.has_test_docs() in [True, False]
assert isinstance(task.aggregation(), dict)
assert isinstance(task.higher_is_better(), dict)
assert task.aggregation().keys() == task.higher_is_better().keys()
for v in task.higher_is_better().values():
assert v in [True, False]
@pytest.fixture()
def limit() -> int:
return 10
assert isinstance(task.VERSION, int)
# test deterministic docs
# (don't test train because it's slow)
# Tests
task2 = task_class()
limit = None
def test_download(task_class: ConfigurableTask):
task_class().download()
assert task_class().dataset is not None
if taskname in ["triviaqa"] or taskname.startswith("pile_"):
limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit))
arr2 = list(islice(task2.validation_docs(), limit))
assert arr == arr2
def test_has_training_docs(task_class: ConfigurableTask):
assert task_class().has_training_docs() in [True, False]
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
def test_check_training_docs(task_class: ConfigurableTask):
task = task_class()
if task.has_training_docs():
assert task._config["training_split"] is not None
if task.has_test_docs():
arr = list(islice(task.test_docs(), limit))
arr2 = list(islice(task2.test_docs(), limit))
assert arr == arr2
def test_has_validation_docs(task_class):
assert task_class().has_validation_docs() in [True, False]
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
def test_check_validation_docs(task_class):
task = task_class()
if task.has_validation_docs():
assert task._config["validation_split"] is not None
if task.has_training_docs():
arr = list(islice(task.training_docs(), limit))
arr2 = list(islice(task2.training_docs(), limit))
assert arr == arr2
def test_has_test_docs(task_class):
assert task_class().has_test_docs() in [True, False]
reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
assert reqs == reqs2
def test_check_test_docs(task_class):
task = task_class()
if task.has_test_docs():
assert task._config["test_split"] is not None
@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, task_class):
print("Evaluating task", taskname)
def test_should_decontaminate(task_class):
task = task_class()
fns = []
if task.has_training_docs():
fns.append(task.training_docs)
if task.has_validation_docs():
fns.append(task.validation_docs)
# test doc might not have labels
# if task.has_test_docs(): fns.append(task.test_docs)
for fn in fns:
# print(list(islice(fn(), 10)))
for doc in islice(fn(), 10):
assert task.should_decontaminate() in [True, False]
if task.should_decontaminate():
assert task._config["doc_to_decontamination_query"] is not None
txt = task.doc_to_text(doc)
tgt = task.doc_to_target(doc)
assert isinstance(txt, str)
assert isinstance(tgt, str)
def test_doc_to_text(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array = [task.doc_to_text(doc) for doc in arr]
# space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
assert all(
isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True) for x in _array
)
def test_create_choices(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
if "multiple_choice" in task._config.group:
_array = [task.doc_to_choice(doc) for doc in arr]
# assert all(len(x) == 4 for x in _array)
assert all(isinstance(x, list) for x in _array)
assert all(isinstance(x[0], str) for x in _array)
def test_doc_to_target(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array_target = [task.doc_to_target(doc) for doc in arr]
assert all(isinstance(label, int) for label in _array_target)
assert len(_array_target) == limit if limit else True
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
# space convention
# allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
if len(txt) != 0:
assert txt[-1] != " "
assert tgt[0] == " " or txt[-1] == "\n"
reqs = task.construct_requests(doc, txt)
def test_build_all_requests(task_class, limit):
task_class().build_all_requests(rank=1, limit=limit, world_size=1)
assert task_class.instances is not None
# construct_requests can return just one request
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
# todo: mock lm after refactoring evaluator.py to not be a mess
# for req in reqs:
# assert isinstance(req, base.Request)
def test_construct_requests(task_class, limit):
task = task_class()
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
assert all(isinstance(doc, list) for doc in requests)
assert len(requests) == limit if limit else True
# def test_create_choices(task_class):
# arr = list(islice(task_class().test_docs(), 1))
# choices = task_class().create_choices(arr[0])
# assert choices is not None
# checking if number of choices is correct
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_basic_interface(taskname, task_class):
# print("Evaluating task", taskname)
# task = task_class()
#
# assert task.has_training_docs() in [True, False]
# assert task.has_validation_docs() in [True, False]
# assert task.has_test_docs() in [True, False]
#
# assert isinstance(task.aggregation(), dict)
# assert isinstance(task.higher_is_better(), dict)
# assert task.aggregation().keys() == task.higher_is_better().keys()
#
# for v in task.higher_is_better().values():
# assert v in [True, False]
#
# assert isinstance(task.VERSION, int)
#
# # test deterministic docs
# # (don't test train because it's slow)
#
# task2 = task_class()
#
# limit = None
#
# if taskname in ["triviaqa"] or taskname.startswith("pile_"):
# limit = 10000
# if task.has_validation_docs():
# arr = list(islice(task.validation_docs(), limit))
# arr2 = list(islice(task2.validation_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
# if task.has_test_docs():
# arr = list(islice(task.test_docs(), limit))
# arr2 = list(islice(task2.test_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
# if task.has_training_docs():
# arr = list(islice(task.training_docs(), limit))
# arr2 = list(islice(task2.training_docs(), limit))
#
# assert arr == arr2
#
# reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]
#
# assert reqs == reqs2
#
#
# @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
# def test_documents_and_requests(taskname, task_class):
# print("Evaluating task", taskname)
# task = task_class()
# fns = []
# if task.has_training_docs():
# fns.append(task.training_docs)
# if task.has_validation_docs():
# fns.append(task.validation_docs)
# # test doc might not have labels
# # if task.has_test_docs(): fns.append(task.test_docs)
#
# for fn in fns:
# # print(list(islice(fn(), 10)))
# for doc in islice(fn(), 10):
#
# txt = task.doc_to_text(doc)
# tgt = task.doc_to_target(doc)
#
# assert isinstance(txt, str)
# assert isinstance(tgt, str)
#
# # space convention
# # allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
# if len(txt) != 0:
# assert txt[-1] != " "
# assert tgt[0] == " " or txt[-1] == "\n"
#
# reqs = task.construct_requests(doc, txt)
#
# # construct_requests can return just one request
# if not isinstance(reqs, (list, tuple)):
# reqs = [reqs]
#
# # todo: mock lm after refactoring evaluator.py to not be a mess
# # for req in reqs:
# # assert isinstance(req, base.Request)
......@@ -6,7 +6,7 @@ import lm_eval.models
def test_description():
seed = 42
num_examples = 1
task_names = ["arc_challenge", "lambada"]
task_names = ["arc_challenge", "arc_easy"]
description_dict = {
"arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
......@@ -40,6 +40,5 @@ def test_description():
ctx = task.fewshot_context(
doc=doc,
num_fewshot=1,
- rnd=rnd,
)
assert description in ctx
......@@ -44,9 +44,9 @@ def test_generate_13_grams_1(caplog):
pass
os.makedirs(test_working_directory)
assert not os.path.exists("pile")
os.makedirs("pile")
archive = Archive(os.path.join("pile", "test.jsonl.zst"))
assert not os.path.exists("../pile")
os.makedirs("../pile")
archive = Archive(os.path.join("../pile", "test.jsonl.zst"))
archive.add_data(data)
archive.commit()
......