Unverified Commit 3d432b1a authored by Charles Foster, committed by GitHub

Merge pull request #4 from EleutherAI/master

Update cfoster0 fork
parents 4a294d8a 4d8ed7d5
@@ -2,9 +2,13 @@ import argparse
import json
import numpy as np
import random
+import itertools
+import collections
+import logging

-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator, base

+logging.getLogger("openai").setLevel(logging.WARNING)


def parse_args():
    parser = argparse.ArgumentParser()
@@ -12,34 +16,30 @@ def parse_args():
    parser.add_argument('--model_args', default="")
    parser.add_argument('--tasks', default="all_tasks")
    parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument('--num_fewshot', type=int, default=1)
+    parser.add_argument('--num_fewshot', type=int, default=0)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--output_path', default=None)
+    parser.add_argument('--limit', type=int, default=None)
+    parser.add_argument('--cache', action="store_true")
    return parser.parse_args()


def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)
+    if args.cache:
+        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")

    task_dict = tasks.get_task_dict(task_names)

-    results = {}
-    for task_name, task in task_dict.items():
-        if not task.has_validation_docs():
-            continue
-        result = task.evaluate(
-            docs=task.validation_docs(),
-            lm=lm,
-            provide_description=args.provide_description,
-            num_fewshot=args.num_fewshot,
-        )
-        results[task_name] = result
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
@@ -47,6 +47,21 @@ def main():
    with open(args.output_path, "w") as f:
        f.write(dumped)

+    # MAKE TABLE
+    from pytablewriter import MarkdownTableWriter
+
+    writer = MarkdownTableWriter()
+    writer.headers = ["Task", "Metric", "Value"]
+
+    values = []
+    for k, dic in results.items():
+        for m, v in dic.items():
+            values.append([k, m, '%.4f' % v])
+            k = ""
+    writer.value_matrix = values
+
+    print(writer.dumps())


if __name__ == "__main__":
    main()
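A usage note, not part of the diff: after this change, the flow that main() wires up through argparse can also be driven directly from Python. The sketch below is a minimal, hedged reconstruction using only calls that appear in this merge (models.get_model, create_from_arg_string, base.CachingLM, tasks.get_task_dict, evaluator.evaluate); the model string, cache path, and task name are placeholder examples, not values taken from the diff.

# Minimal sketch of the merged evaluation flow (illustrative values only).
import json
from lm_eval import models, tasks, evaluator, base

lm = models.get_model('gpt2').create_from_arg_string("device=cpu")  # example model and args
lm = base.CachingLM(lm, 'lm_cache/gpt2_device-cpu.db')              # optional, mirrors --cache
task_dict = tasks.get_task_dict(['lambada'])                        # example task name
results = evaluator.evaluate(lm, task_dict, False, 0, 10)           # provide_description=False, num_fewshot=0, limit=10
print(json.dumps(results, indent=2))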
black==20.8b1
best_download>=0.0.5
datasets>=1.2.1
click>=7.1
scikit-learn>=0.24.1
torch>=1.7
transformers>=4.1
sqlitedict==1.6.0
pytablewriter
\ No newline at end of file
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Metrics"]

values = []

def chk(tf):
    if tf:
        return '✓'
    else:
        return ' '

for tname, Task in tasks.TASK_REGISTRY.items():
    task = Task()
    values.append([tname, chk(task.has_training_docs()), chk(task.has_validation_docs()), chk(task.has_test_docs()), ', '.join(task.aggregation().keys())])

writer.value_matrix = values
print(writer.dumps())
\ No newline at end of file
@@ -4,6 +4,7 @@ import os
import random

from lm_eval import tasks
+from lm_eval.utils import join_iters

EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"

@@ -13,6 +14,7 @@ def parse_args():
    parser.add_argument('--output_base_path', required=True)
    parser.add_argument('--tasks', default="all_tasks")
    parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument('--sets', type=str, default="val")  # example: val,test
    parser.add_argument('--num_fewshot', type=int, default=1)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--num_examples', type=int, default=1)

@@ -31,12 +33,22 @@ def main():
    task_dict = tasks.get_task_dict(task_names)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
-        if not task.has_validation_docs():
-            docs = task.training_docs()
-        else:
-            docs = task.validation_docs()
+        iters = []
+
+        for set in args.sets.split(","):
+            if set == 'train' and task.has_training_docs():
+                docs = task.training_docs()
+            if set == 'val' and task.has_validation_docs():
+                docs = task.validation_docs()
+            if set == 'test' and task.has_test_docs():
+                docs = task.test_docs()
+            iters.append(docs)
+
+        docs = join_iters(iters)

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
-            for i, doc in zip(range(args.num_examples), docs):
+            for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
...
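The join_iters helper imported from lm_eval.utils is not shown in this diff. As a hedged sketch only, assuming it simply chains the collected iterables end to end, it could look like the snippet below; this is not the actual library implementation.

# Hypothetical sketch of a join_iters-style helper; the real
# lm_eval.utils.join_iters is not part of this diff.
def join_iters(iters):
    for it in iters:
        yield from it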
import setuptools

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="lm_eval_harness",
    version="0.0.1",
    author="Leo Gao",
    author_email="lg@eleuther.ai",
    description="A framework for evaluating autoregressive language models",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/EleutherAI/lm-evaluation-harness",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest


# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    def ll_fn(reqs):
        for ctx, cont in reqs:
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'

        res = []
        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))
        return res

    lm.loglikelihood = ll_fn

    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
import lm_eval.models as models
import lm_eval.base as base


def test_gpt2():
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")

    (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
        ('The quick brown fox jumps over the lazy', ' dog'),
        ('The quick brown fox jumps over the lazy', ' cat'),
    ])

    assert ll_dog > ll_cat
    assert not ig_cat

    # test empty context
    gpt2.loglikelihood([('', 'test')])
\ No newline at end of file
import lm_eval.tasks as tasks
import lm_eval.base as base
from itertools import islice
import pytest


@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_basic_interface(taskname, Task):
    print('Evaluating task', taskname)
    # dl = Task.download
    # Task.download = MagicMock()
    task = Task()
    # Task.download = dl

    assert task.has_training_docs() in [True, False]
    assert task.has_validation_docs() in [True, False]
    assert task.has_test_docs() in [True, False]

    assert isinstance(task.aggregation(), dict)
    assert isinstance(task.higher_is_better(), dict)
    assert task.aggregation().keys() == task.higher_is_better().keys()
    for v in task.higher_is_better().values(): assert v in [True, False]

    # test deterministic docs
    # (don't test train because it's slow)

    task2 = Task()

    if task.has_validation_docs():
        arr = list(islice(task.validation_docs(), 100))
        arr2 = list(islice(task2.validation_docs(), 100))

        assert arr == arr2

        reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]

        assert reqs == reqs2

    if task.has_test_docs():
        arr = list(islice(task.test_docs(), 100))
        arr2 = list(islice(task2.test_docs(), 100))

        assert arr == arr2

        reqs = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        reqs2 = [task2.construct_requests(doc, task2.doc_to_text(doc)) for doc in arr2]

        assert reqs == reqs2
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_documents_and_requests(taskname, Task):
    print('Evaluating task', taskname)
    task = Task()

    fns = []
    if task.has_training_docs(): fns.append(task.training_docs)
    if task.has_validation_docs(): fns.append(task.validation_docs)
    # test docs might not have labels
    # if task.has_test_docs(): fns.append(task.test_docs)

    for fn in fns:
        # print(list(islice(fn(), 10)))
        for doc in islice(fn(), 10):
            txt = task.doc_to_text(doc)
            tgt = task.doc_to_target(doc)

            assert isinstance(txt, str)
            assert isinstance(tgt, str)

            # space convention
            assert txt[-1] != ' '
            assert tgt[0] == ' ' or txt[-1] == '\n'

            reqs = task.construct_requests(doc, txt)

            # todo: mock lm after refactoring evaluator.py to not be a mess
            for req in reqs:
                assert isinstance(req, base.Request)
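Both test files assert the same whitespace convention between a context and its continuation: the context must not end in a space, and the continuation must start with one unless the context ends with a newline. A toy example that satisfies those checks; the strings are invented for illustration and not taken from any task.

# Illustrative only; these strings do not come from a real task.
txt = "Question: What does the fox jump over?\nAnswer:"
tgt = " the lazy dog"

assert txt[-1] != ' '                    # context has no trailing space
assert tgt[0] == ' ' or txt[-1] == '\n'  # continuation carries the leading space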