Unverified Commit 3d432b1a authored by Charles Foster, committed by GitHub

Merge pull request #4 from EleutherAI/master

Update cfoster0 fork
parents 4a294d8a 4d8ed7d5
@@ -2,9 +2,13 @@ import argparse
 import json
 import numpy as np
 import random
+import itertools
+import collections
+import logging

-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator, base

+logging.getLogger("openai").setLevel(logging.WARNING)

 def parse_args():
     parser = argparse.ArgumentParser()
@@ -12,34 +16,30 @@ def parse_args():
     parser.add_argument('--model_args', default="")
     parser.add_argument('--tasks', default="all_tasks")
     parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument('--num_fewshot', type=int, default=1)
+    parser.add_argument('--num_fewshot', type=int, default=0)
     parser.add_argument('--seed', type=int, default=1234)
     parser.add_argument('--output_path', default=None)
+    parser.add_argument('--limit', type=int, default=None)
+    parser.add_argument('--cache', action="store_true")
     return parser.parse_args()

 def main():
     args = parse_args()
     random.seed(args.seed)
     np.random.seed(args.seed)

     lm = models.get_model(args.model).create_from_arg_string(args.model_args)
+    if args.cache:
+        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')

     if args.tasks == "all_tasks":
         task_names = tasks.ALL_TASKS
     else:
         task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)

-    results = {}
-    for task_name, task in task_dict.items():
-        if not task.has_validation_docs():
-            continue
-        result = task.evaluate(
-            docs=task.validation_docs(),
-            lm=lm,
-            provide_description=args.provide_description,
-            num_fewshot=args.num_fewshot,
-        )
-        results[task_name] = result
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

     dumped = json.dumps(results, indent=2)
     print(dumped)
@@ -47,6 +47,21 @@ def main():
         with open(args.output_path, "w") as f:
             f.write(dumped)

+    # MAKE TABLE
+    from pytablewriter import MarkdownTableWriter
+
+    writer = MarkdownTableWriter()
+    writer.headers = ["Task", "Metric", "Value"]
+
+    values = []
+    for k, dic in results.items():
+        for m, v in dic.items():
+            values.append([k, m, '%.4f' % v])
+            k = ""
+    writer.value_matrix = values
+    print(writer.dumps())

 if __name__ == "__main__":
     main()
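The new --cache flag wraps the model in base.CachingLM, keyed on a sqlitedict database file derived from the model arguments. As a rough illustration of the idea only (the class name and one-request-at-a-time lookup below are simplifications invented here, not the harness's actual implementation):

import os
import sqlitedict

class IllustrativeCachingLM:
    # Sketch: memoize (context, continuation) -> result in a SQLite-backed
    # dict, so re-running the same evaluation skips the model entirely.
    def __init__(self, lm, cache_db):
        os.makedirs(os.path.dirname(cache_db) or ".", exist_ok=True)
        self.lm = lm
        self.cache = sqlitedict.SqliteDict(cache_db, autocommit=True)

    def loglikelihood(self, requests):
        results = []
        for req in requests:
            key = repr(req)
            if key not in self.cache:
                # Cache miss: ask the underlying model (one at a time, for clarity).
                self.cache[key] = self.lm.loglikelihood([req])[0]
            results.append(self.cache[key])
        return results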
black==20.8b1
best_download>=0.0.5
datasets>=1.2.1
click>=7.1
scikit-learn>=0.24.1
torch>=1.7
transformers>=4.1
sqlitedict==1.6.0
pytablewriter
\ No newline at end of file
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Metrics"]

values = []


def chk(tf):
    if tf:
        return '✓'
    else:
        return ' '


for tname, Task in tasks.TASK_REGISTRY.items():
    task = Task()
    values.append([
        tname,
        chk(task.has_training_docs()),
        chk(task.has_validation_docs()),
        chk(task.has_test_docs()),
        ', '.join(task.aggregation().keys()),
    ])

writer.value_matrix = values
print(writer.dumps())
\ No newline at end of file
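This table script relies on each registered Task exposing has_*_docs() checks and an aggregation() mapping from metric names to aggregation functions. A hypothetical registry entry satisfying that interface (DummyTask is invented here purely to show the expected shape, not a class from the repo):

class DummyTask:
    # Hypothetical stand-in for an lm_eval.base.Task subclass.
    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def aggregation(self):
        # Metric name -> aggregation function over per-document scores.
        return {"acc": lambda scores: sum(scores) / len(scores)}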
@@ -4,6 +4,7 @@ import os
 import random

 from lm_eval import tasks
+from lm_eval.utils import join_iters

 EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
@@ -13,6 +14,7 @@ def parse_args():
     parser.add_argument('--output_base_path', required=True)
     parser.add_argument('--tasks', default="all_tasks")
     parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument('--sets', type=str, default="val")  # example: val,test
     parser.add_argument('--num_fewshot', type=int, default=1)
     parser.add_argument('--seed', type=int, default=1234)
     parser.add_argument('--num_examples', type=int, default=1)
@@ -31,12 +33,22 @@ def main():
     task_dict = tasks.get_task_dict(task_names)
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if not task.has_validation_docs():
+        iters = []
+        for set in args.sets.split(","):
+            if set == 'train' and task.has_training_docs():
+                docs = task.training_docs()
+            else:
+                if set == 'val' and task.has_validation_docs():
+                    docs = task.validation_docs()
+                if set == 'test' and task.has_test_docs():
+                    docs = task.test_docs()
+            iters.append(docs)
+        docs = join_iters(iters)
         with open(os.path.join(args.output_base_path, task_name), "w") as f:
-            for i, doc in zip(range(args.num_examples), docs):
+            for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
                 f.write(EXAMPLE_DIVIDER.format(i=i))
                 ctx = task.fewshot_context(
                     doc=doc,
......
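write_out.py now concatenates whichever document sets were requested via lm_eval.utils.join_iters. Presumably that helper simply chains the iterators, along these lines (a sketch, not the utility's exact source):

def join_iters(iters):
    # Yield every document from each iterable in turn,
    # so train/val/test can be dumped as one stream.
    for it in iters:
        yield from it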
import setuptools

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="lm_eval_harness",
    version="0.0.1",
    author="Leo Gao",
    author_email="lg@eleuther.ai",
    description="A framework for evaluating autoregressive language models",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/EleutherAI/lm-evaluation-harness",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
import lm_eval.models as models
import lm_eval.base as base


def test_gpt2():
    gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")

    (ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
        ('The quick brown fox jumps over the lazy', ' dog'),
        ('The quick brown fox jumps over the lazy', ' cat'),
    ])

    assert ll_dog > ll_cat
    assert not ig_cat

    # test empty context
    gpt2.loglikelihood([('', 'test')])
\ No newline at end of file
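The tuple unpacking in this test reflects the loglikelihood contract: one (log-likelihood, is_greedy) pair per (context, continuation) request, where is_greedy flags whether the continuation matches the model's greedy decode. A minimal, self-contained check of that shape (assuming the same gpt2 model handle as in the test):

import lm_eval.models as models

gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
results = gpt2.loglikelihood([('The quick brown fox jumps over the lazy', ' dog')])
ll, is_greedy = results[0]  # log-likelihood of ' dog' given the context, plus greedy flag
assert ll <= 0.0            # log-probabilities of continuations are non-positive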