Commit 884c29fb authored by Charles Foster

Bring SQuAD fork up to date with EAI upstream

parents 232c9ab6 8809c5f1
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
from lm_eval.base import LM
import transformers


class DryrunLM(LM):
    """Stub LM that returns dummy results while tallying how many tokens each task would send to the API."""

    def __init__(self):
        self.tokencost = 0
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
        self.tokenizer.pad_token = "<|endoftext|>"

    @classmethod
    def create_from_arg_string(cls, arg_string):
        return cls()

    def loglikelihood(self, requests):
        res = []

        for ctx, cont in requests:
            res.append((-random.random(), False))
            # charge for every token that would be sent: context + continuation
            self.tokencost += len(self.tokenizer.tokenize(ctx + cont))

        return res

    def greedy_until(self, requests):
        res = []

        for ctx, until in requests:
            res.append("lol")

            # assume worst case - generates until 256 tokens
            self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256

        return res
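
# A minimal usage sketch (not part of the commit; the example request is
# hypothetical):
def _dryrun_demo():
    lm = DryrunLM()
    lm.loglikelihood([("The capital of France is", " Paris")])
    print(lm.tokencost)  # GPT-2 token count of context + continuation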

def main():
    lm = DryrunLM()

    values = []
    for taskname in list(tasks.TASK_REGISTRY.keys()):
        lm.tokencost = 0
        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, False, 0, None)
        print(taskname, lm.tokencost)
        # convert tokens to dollars at $0.06 per 1K (the Davinci rate)
        values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.06])

    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Davinci Cost (USD)"]

    values.sort(key=lambda x: -x[1])
    tottokens = sum(x[1] for x in values)
    values.append(["**Total**", tottokens, tottokens / 1000 * 0.06])

    writer.value_matrix = values
    print(writer.dumps())


if __name__ == "__main__":
    main()
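
# Back-of-envelope check of the cost column (a sketch; $0.06 per 1K tokens is
# the rate hard-coded in main() above):
#
#   1_000_000 tokens -> 1_000_000 / 1000 * 0.06 = $60.00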
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
logging.getLogger("openai").setLevel(logging.WARNING)
fewshot_descriptions = [
    "foo",
    "bar",
]
task = "lambada"
num_fewshot = 0
model = "gpt2"
model_args = ""
limit = None
no_cache = False
class CustomDescTask:
    """Wraps a task, replacing its fewshot_description with a fixed string."""

    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        # `self` is captured by the closure, so the zero-argument override
        # returns self.desc when the evaluator calls fewshot_description()
        def fewshot_description():
            return self.desc

        self.task.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        # delegate everything else to the wrapped task
        return getattr(self.task, attr)
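
# Usage sketch (not part of the commit; "lambada" is the task configured above):
def _custom_desc_demo():
    base_task = tasks.get_task("lambada")()
    wrapped = CustomDescTask(base_task, "foo")
    assert wrapped.fewshot_description() == "foo"  # served by the installed closure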

def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        lm = base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {k: CustomDescTask(v, desc) for k, v in task_dict.items()}

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)

        dumped = json.dumps(results, indent=2)
        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                # blank the task name so it only prints on its first row
                k = ""
        writer.value_matrix = values
        print(writer.dumps())


if __name__ == "__main__":
    main()
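
# Illustrative shape of the per-description table (metric names and values
# hypothetical); the `k = ""` trick above blanks the task name after its
# first row:
#
#   | Task    | Metric | Value  |
#   |---------|--------|--------|
#   | lambada | ppl    | 3.0000 |
#   |         | acc    | 0.7500 |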

from lm_eval import tasks
from itertools import islice

ct = 3

for tname, Task in tasks.TASK_REGISTRY.items():
    task = Task()

    print('#', tname)

    docs = islice(task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct)
    print()
    print('**Zero-Shot Prompt**:', "\n```\n" + task.fewshot_description() + "\n```\n")

    # iterate the sliced docs directly so splits shorter than ct don't raise StopIteration
    for doc in docs:
        print()
        print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n")
        print()
        print('**Target**:', "\n```\n" + task.doc_to_target(doc) + "\n```\n")
        print()
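
# Sketch (not part of the commit): previewing a single registered task rather
# than looping over the whole registry; assumes "lambada" is in TASK_REGISTRY.
def _preview_one(tname="lambada"):
    t = tasks.get_task(tname)()
    docs = t.validation_docs() if t.has_validation_docs() else t.test_docs()
    doc = next(iter(docs))
    print(t.doc_to_text(doc))
    print(t.doc_to_target(doc))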

@@ -12,4 +12,10 @@ def test_gpt2():
     assert not ig_cat
 
     # test empty context
-    gpt2.loglikelihood([('', 'test')])
\ No newline at end of file
+    gpt2.loglikelihood([('', 'test')])
+
+    gen, = gpt2.greedy_until([
+        ('The quick brown fox jumps over the lazy', ['.', '\n'])
+    ])
+
+    assert gen == ', lazy fox and they both fall to the ground'
\ No newline at end of file
@@ -75,6 +75,9 @@ def test_documents_and_requests(taskname, Task):
         assert tgt[0] == ' ' or txt[-1] == '\n'
 
         reqs = task.construct_requests(doc, txt)
+
+        # construct_requests can return just one request
+        if not isinstance(reqs, (list, tuple)): reqs = [reqs]
 
         # todo: mock lm after refactoring evaluator.py to not be a mess
         for req in reqs:
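
# One possible mock in the spirit of the todo above (a sketch, not part of
# this commit; modeled on the DryrunLM stub from the first file):
from lm_eval.base import LM

class MockLM(LM):
    @classmethod
    def create_from_arg_string(cls, arg_string):
        return cls()

    def loglikelihood(self, requests):
        # deterministic logprob, never an exact greedy match
        return [(-1.0, False) for _ in requests]

    def greedy_until(self, requests):
        return ["" for _ in requests]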