Commit 9b933d96 authored by jeffhsu3

merged changes

parents c71dcb91 c0fbf9e8
@@ -28,10 +28,10 @@ class BoolQ(HFTask):
         return "Read the following passages and answer each question with a yes or a no."

     def doc_to_text(self, doc):
-        return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: "
+        return f"{doc['passage']}\nquestion: {doc['question']}\nanswer:"

     def doc_to_target(self, doc):
-        return yesno(doc['label'])
+        return " " + yesno(doc['label'])

     def construct_requests(self, doc, ctx):
@@ -156,12 +156,12 @@ class Copa(HFTask):
             "cause": "because",
             "effect": "therefore",
         }[doc["question"]]
-        return doc["premise"].strip()[:-1] + f" {connector} "
+        return doc["premise"].strip()[:-1] + f" {connector}"

     def doc_to_target(self, doc):
         correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
         # Connect the sentences
-        return self.convert_choice(correct_choice)
+        return " " + self.convert_choice(correct_choice)

     def construct_requests(self, doc, ctx):
         choice1 = " " + self.convert_choice(doc["choice1"])
@@ -261,7 +261,7 @@ class ReCoRD(HFTask):
         return True

     def has_test_docs(self):
-        return True
+        return False

     def fewshot_description(self):
         # TODO: figure out actual description
@@ -322,6 +322,7 @@ class ReCoRD(HFTask):
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        max_idx = np.argmax(np.array(results))
        prediction = doc["entities"][max_idx]
        gold_label_set = list(set(doc["answers"]))
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
...
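Illustrative note on the whitespace changes above (not part of the commit): the trailing space moves out of doc_to_text and onto the target, so contexts never end in a space, matching the new test assertions further down. Assuming yesno(1) == "yes", a made-up BoolQ doc round-trips like this:

# Illustrative only; the doc is invented and yesno(1) == "yes" is assumed.
doc = {"passage": "P.", "question": "Q?", "label": 1}
# doc_to_text(doc)   -> "P.\nquestion: Q?\nanswer:"   (no trailing space)
# doc_to_target(doc) -> " yes"                        (continuation carries the leading space)
# concatenated prompt: "P.\nquestion: Q?\nanswer: yes"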
import os
import json
import random
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
from ..utils import sh


-class TriviaQA(Dataset):
+class TriviaQA(Task):
    def download(self):
        if not os.path.exists('data/triviaqa'):
            sh("""
...
 from . common import HFTask
+from lm_eval.base import mean, rf


 class WebQs(HFTask):
     DATASET_PATH = "web_questions"
@@ -18,7 +19,6 @@ class WebQs(HFTask):
         return ""

     def doc_to_text(self, doc):
-        print(doc)
         return "Q: " + doc['question'] + '\nA:'

     def doc_to_target(self, doc):
@@ -27,47 +27,36 @@ class WebQs(HFTask):
         # TODO: make sure we're actually handling multi-answer correctly
         return " " + doc['answers'][0]

-    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
-        Requests which will be sent to the LM.
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural
-            language description, as well as the few shot examples, and the question
-            part of the document for `doc`.
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
-
-    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
-        dict where keys are the names of submetrics and values are the values of
-        the metric for that one document
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param results:
-            The results of the requests created in construct_requests.
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+    def _remove_prefixes(self, aliases):
+        # Optimization: Remove any alias that has a strict prefix elsewhere in the list
+        # we can do this because if the prefix is acceptable by isgreedy, we can stop looking
+        aliases.sort()
+        ret = [aliases[0]]
+        for alias in aliases[1:]:
+            if not alias.startswith(ret[-1]):
+                ret.append(alias)
+        return ret
+
+    def construct_requests(self, doc, ctx):
+        ret = []
+        for alias in self._remove_prefixes(doc['answers']):
+            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
+            ret.append(is_prediction)
+        return ret
+
+    def process_results(self, doc, results):
+        return {
+            "acc": float(any(results))
+        }

     def aggregation(self):
-        """
-        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean,
+        }

     def higher_is_better(self):
-        """
-        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
-            whether a higher value of the submetric is better
-        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": True
+        }
\ No newline at end of file
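A quick illustration of the _remove_prefixes pruning above (the alias list is made up):

# Illustrative only; invented aliases.
aliases = ["1964", "1964 World's Fair", "New York"]
# after sorting, "1964 World's Fair" starts with the kept alias "1964", so it is dropped:
# _remove_prefixes(aliases) -> ["1964", "New York"]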
 import numpy as np
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import f1_score, matthews_corrcoef
-from tqdm import auto as tqdm_lib
-from . common import HFTask, simple_accuracy_metric, yesno
+from . common import HFTask
+from lm_eval.base import rf, mean
+
+"""
+This evaluation of Winogrande uses partial evaluation as described by
+Trinh & Le in Simple Method for Commonsense Reasoning (2018).
+Reference: https://arxiv.org/abs/1806.02847
+"""


 class Winogrande(HFTask):
     DATASET_PATH = "winogrande"
@@ -17,35 +22,31 @@ class Winogrande(HFTask):
     def has_test_docs(self):
         return True

-    def training_docs(self):
-        if self.has_training_docs():
-            return self.data["train"]
-
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.data["validation"]
-
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.data["test"]
-
     def fewshot_description(self):
         # TODO: redo description
         return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."

+    @classmethod
+    def partial_context(cls, doc):
+        # Substitute the pronoun in the sentence with each candidate choice
+        # and ignore everything after.
+        pronoun_loc = doc["sentence"].index("_")
+        context1 = doc["sentence"][:pronoun_loc] + doc["option1"]
+        context2 = doc["sentence"][:pronoun_loc] + doc["option2"]
+        return context1, context2
+
+    @classmethod
+    def partial_target(cls, doc):
+        # The target is everything after the document specified pronoun.
+        pronoun_loc = doc["sentence"].index("_") + 1
+        return doc["sentence"][pronoun_loc:].strip()
+
     def doc_to_text(self, doc):
-        return doc['sentence']
+        context1, context2 = self.partial_context(doc)
+        return context1 + '\n' + context2 + '\n'

     def doc_to_target(self, doc):
-        text = doc['sentence']
-        answer_n = doc['answer']
-        if answer_n == '1':
-            answer = doc['option1']
-        elif answer_n == '2':
-            answer = doc['option2']
-        else:
-            raise ValueError("Winogrande from HF datasets contained an invalid answer key")
-        return text.replace("_", answer)
+        return self.partial_target(doc)

     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -58,8 +59,11 @@ class Winogrande(HFTask):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        target = self.partial_target(doc)
+        context1, context2 = self.partial_context(doc)
+        ll_context1, _ = rf.loglikelihood(context1, " " + target)
+        ll_context2, _ = rf.loglikelihood(context2, " " + target)
+        return ll_context1, ll_context2

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -71,8 +75,10 @@ class Winogrande(HFTask):
        :param results:
            The results of the requests created in construct_requests.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        answer = int(doc["answer"]) - 1  # `- 1` b/c doc["answer"] ∈ {'1', '2'}
+        return {
+            "acc": np.argmax(results) == answer
+        }

     def aggregation(self):
        """
@@ -80,8 +86,9 @@ class Winogrande(HFTask):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }

     def higher_is_better(self):
        """
@@ -89,5 +96,6 @@ class Winogrande(HFTask):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": True
+        }
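To make the partial-evaluation scheme above concrete, here is what it produces for a made-up Winogrande-style doc (illustrative only, not part of the commit):

# Illustrative only; the doc below is invented.
doc = {
    "sentence": "The trophy doesn't fit into the suitcase because _ is too small.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "2",
}
# partial_context(doc) -> ("The trophy doesn't fit into the suitcase because the trophy",
#                          "The trophy doesn't fit into the suitcase because the suitcase")
# partial_target(doc)  -> "is too small."
# construct_requests scores " is too small." under each context; process_results
# takes the argmax over the two loglikelihoods and compares it to int(doc["answer"]) - 1.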
-import json
+import numpy as np
 import random
-import os
-from lm_eval.base import Dataset
-from ..utils import sh
+from lm_eval.base import rf, mean
+from . common import HFTask
+
+"""
+NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
+as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
+See: https://arxiv.org/abs/1806.02847
+"""


-class WinogradSchemaChallenge273(Dataset):
+class WinogradSchemaChallenge273(HFTask):
+    DATASET_PATH = "winograd_wsc"
+    DATASET_NAME = "wsc273"
+
+    upper_pronouns = ["A", "An", "The", "She", "He",
+                      "It", "They", "My", "His", "Her", "Their"]
+
     def __init__(self):
         super().__init__()
+        self.data = self.__clean_data()

-    def download(self):
-        if not os.path.exists('data/wsc273'):
-            sh("""
-                mkdir -p data/wsc273
-                wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json
-                """)
+    def __clean_data(self):
+        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
+        data = []
+        for doc in self.data["test"]:
+            doc["text"] = doc["text"].replace("  ", " ")
+            doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
+            doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
+            data.append(doc)
+        return {"test": data}
+
+    def __normalize_option(self, option, doc):
+        # Append `'s` to possessive determiner based options.
+        if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
+            option += "'s"
+        # Appropriately lowercase the pronoun in the option.
+        pronoun = option.split()[0]
+        start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
+        if not start_of_sentence and pronoun in self.upper_pronouns:
+            return option.replace(pronoun, pronoun.lower())
+        return option

     def has_training_docs(self):
         return False
@@ -25,60 +51,35 @@ class WinogradSchemaChallenge273(Dataset):
     def has_test_docs(self):
         return True

-    def training_docs(self):
-        return []
-
-    def validation_docs(self):
-        return []
-
-    def test_docs(self):
-        myjson = json.load(open('data/wsc273/wsc273.json'))
-        return self.load_doc(myjson)
+    def fewshot_examples(self, k):
+        # NOTE: `super().fewshot_examples` samples from training docs which are
+        # not available for this test-set-only dataset.
+        return random.sample(list(self.test_docs()), k)

     def fewshot_description(self):
         # TODO: redo description
         return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."

-    def load_doc(self, myjson):
-        docs = []
-        for i in range(0, 273 * 2, 2):
-            item1 = myjson[i]
-            item2 = myjson[i+1]
-
-            if item1['question_id'] != item2['question_id']:
-                raise ValueError("WSC273 has missing completion pair.")
-            question_id = item1['question_id']
-
-            if item1['correctness'] == True:
-                doc = {
-                    'id': question_id,
-                    'completions': {
-                        'T': item1['substitution'],
-                        'F': item2['substitution'],
-                    },
-                }
-            if item2['correctness'] == True:
-                doc = {
-                    'id': question_id,
-                    'completions': {
-                        'F': item1['substitution'],
-                        'T': item2['substitution'],
-                    },
-                }
-            docs.append(doc)
-        return docs
+    @classmethod
+    def partial_context(cls, doc):
+        # Substitute the pronoun in the original text with each candidate
+        # choice and ignore everything after.
+        context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
+        context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
+        return context1, context2
+
+    @classmethod
+    def partial_target(cls, doc):
+        # The target is everything after the document specified pronoun.
+        start_index = doc["pronoun_loc"] + len(doc["pronoun"])
+        return doc["text"][start_index:].strip()

     def doc_to_text(self, doc):
-        # TODO: implement
-        pass
+        context1, context2 = self.partial_context(doc)
+        return context1 + '\n' + context2 + '\n'

     def doc_to_target(self, doc):
-        # TODO: implement
-        pass
+        return self.partial_target(doc)

     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -91,8 +92,11 @@ class WinogradSchemaChallenge273(Dataset):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        target = self.partial_target(doc)
+        context1, context2 = self.partial_context(doc)
+        ll_context1, _ = rf.loglikelihood(context1, " " + target)
+        ll_context2, _ = rf.loglikelihood(context2, " " + target)
+        return ll_context1, ll_context2

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -104,8 +108,9 @@ class WinogradSchemaChallenge273(Dataset):
        :param results:
            The results of the requests created in construct_requests.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": np.argmax(results) == doc["label"]
+        }

     def aggregation(self):
        """
@@ -113,8 +118,9 @@ class WinogradSchemaChallenge273(Dataset):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }

     def higher_is_better(self):
        """
@@ -122,5 +128,6 @@ class WinogradSchemaChallenge273(Dataset):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": True
+        }
@@ -29,3 +29,14 @@ def simple_parse_args_string(args_string):
 def join_iters(iters):
     for iter in iters:
         yield from iter
+
+
+def chunks(iter, n):
+    arr = []
+    for x in iter:
+        arr.append(x)
+        if len(arr) == n:
+            yield arr
+            arr = []
+
+    if arr: yield arr
\ No newline at end of file
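Example of the new chunks() helper above (illustrative, not part of the commit):

# Illustrative usage of chunks():
list(chunks(range(7), 3))  # -> [[0, 1, 2], [3, 4, 5], [6]]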
import os
from functools import reduce
import operator
from tqdm import tqdm
import json
class ExitCodeError(Exception):
pass
def sh(x):
if os.system(x):
raise ExitCodeError()
def ls(x):
return [x + '/' + fn for fn in os.listdir(x)]
def lsr(x):
if os.path.isdir(x):
return reduce(operator.add, map(lsr, ls(x)), [])
else:
return [x]
def fwrite(fname, content):
with open(fname, 'w') as fh:
fh.write(content)
def fread(fname):
with open(fname) as fh:
return fh.read()
class each:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(map(self.f, other))
class filt:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(filter(self.f, other))
class apply:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return self.f(other)
class one:
def __rrshift__(self, other):
try:
if isinstance(other, list):
assert len(other) == 1
return other[0]
return next(other)
except:
return None
class join:
def __init__(self, sep):
self.sep = sep
def __rrshift__(self, other):
if other is None:
return
try:
return self.sep.join(other)
except:
return None
Y = object()
def id(x):
return x
class Reflective:
def __getattribute__(self, f):
def _fn(*args, **kwargs):
return lambda x: x.__getattribute__(f)(*args, **kwargs)
return _fn
def __getitem__(self, a):
return lambda x: x[a]
def __mul__(self, other):
if other == Y:
def _f(x, y=None):
if y == None:
x, y = x
return x * y
return _f
return lambda x: x * other
def __rmul__(self, other):
if other == Y:
def _f(x, y=None):
if y == None:
x, y = x
return y * x
return _f
return lambda x: other * x
def __add__(self, other):
if other == Y:
def _f(x, y=None):
if y == None:
x, y = x
return x + y
return _f
return lambda x: x + other
def __radd__(self, other):
if other == Y:
def _f(x, y=None):
if y == None:
x, y = x
return y + x
return _f
return lambda x: other + x
# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
curr = init
for elem in arr:
curr = f(curr, elem)
return curr
# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
curr = init
for elem in arr[::-1]:
curr = f(elem, curr)
return curr
def comp(*fs):
if len(fs) == 1:
return fs[0]
def _f(x):
for f in fs[::-1]:
x = f(x)
return x
return _f
X = Reflective()
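The pipeline helpers in this file are undocumented; a minimal sketch of how they compose, assuming the names defined above (each, filt, apply, join, X) are in scope:

# Illustrative only; these are the helpers defined above.
words = ["alpha", "beta", "gamma"]
words >> each(X.upper()) >> filt(lambda w: w.startswith("A")) >> join(", ")  # -> "ALPHA"

nums = [1, 2, 3, 4]
nums >> each(X * 10) >> apply(sum)  # -> 100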
@@ -4,9 +4,11 @@ import numpy as np
 import random
 import itertools
 import collections
+import logging

-from lm_eval import models, tasks
+from lm_eval import models, tasks, evaluator, base
+
+logging.getLogger("openai").setLevel(logging.WARNING)


 def parse_args():
     parser = argparse.ArgumentParser()
@@ -18,102 +20,26 @@ def parse_args():
     parser.add_argument('--seed', type=int, default=1234)
     parser.add_argument('--output_path', default=None)
     parser.add_argument('--limit', type=int, default=None)
+    parser.add_argument('--cache', action="store_true")
     return parser.parse_args()


 def main():
     args = parse_args()
     random.seed(args.seed)
     np.random.seed(args.seed)

     lm = models.get_model(args.model).create_from_arg_string(args.model_args)
+    if args.cache:
+        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')

     if args.tasks == "all_tasks":
         task_names = tasks.ALL_TASKS
     else:
         task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)

-    task_dict_items = []
-    for name, task in task_dict.items():
-        if task.has_validation_docs():
-            task_dict_items.append((name, task, 'validation'))
-        elif task.has_test_docs():
-            task_dict_items.append((name, task, 'test'))
-        elif task.has_training_docs():
-            task_dict_items.append((name, task, 'training'))
-
-    results = collections.defaultdict(dict)
-
-    requests = collections.defaultdict(list)
-    requests_origin = collections.defaultdict(list)
-
-    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
-    # we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad
-    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have
-
-    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
-    docs = {}
-
-    # get lists of each type of requeste
-    for task_name, task, dset in task_dict_items:
-        if dset == 'training':
-            temp = task.training_docs()
-        elif dset == 'test':
-            temp = task.test_docs()
-        else:
-            temp = task.validation_docs()
-
-        for doc_id, doc in enumerate(itertools.islice(temp, 0, args.limit)):
-            docs[(task_name, doc_id)] = doc
-
-            ctx = task.fewshot_context(
-                doc=doc,
-                provide_description=args.provide_description,
-                num_fewshot=args.num_fewshot,
-            )
-
-            reqs = task.construct_requests(doc, ctx)
-
-            for i, req in enumerate(reqs):
-                requests[req.type].append(req)
-                # i: index in requests for a single task instance
-                # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.type].append((i, task_name, doc, doc_id))
-
-    # all responses for each (task, doc)
-    process_res_queue = collections.defaultdict(list)
-
-    # execute each type of request
-    for reqtype, reqs in requests.items():
-        # TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing
-        # only in index. We could implement some kind of caching, but that would be more of a bandaid
-        # solution. we could also implement some kind of autogrouping here; they should end up next to each other.
-        resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
-
-        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
-            process_res_queue[(task_name, doc_id)].append((i, resp))
-
-    vals = collections.defaultdict(list)
-
-    # unpack results and sort back in order and return control to Task
-    for (task_name, doc_id), requests in process_res_queue.items():
-        requests.sort(key=lambda x: x[0])
-        requests = [x[1] for x in requests]
-
-        task = task_dict[task_name]
-        doc = docs[(task_name, doc_id)]
-
-        metrics = task.process_results(doc, requests)
-        for metric, value in metrics.items():
-            vals[(task_name, metric)].append(value)
-
-    # aggregate results
-    for (task_name, metric), items in vals.items():
-        task = task_dict[task_name]
-        results[task_name][metric] = task.aggregation()[metric](items)
+    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

     dumped = json.dumps(results, indent=2)
     print(dumped)
...
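A minimal sketch of the new --cache path in main(), using only calls that appear in this diff; the model name and arg string are just examples, and the cache is presumably backed by the sqlitedict dependency added below:

# Illustrative only; mirrors what main() now does when --cache is passed.
from lm_eval import models, base

lm = models.get_model("gpt2").create_from_arg_string("device=cpu")
lm = base.CachingLM(lm, "lm_cache/gpt2_device-cpu.db")  # memoizes request results on disk
# `lm` is then handed to evaluator.evaluate(...) exactly like the uncached model.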
@@ -5,3 +5,4 @@ click>=7.1
 scikit-learn>=0.24.1
 torch>=1.7
 transformers>=4.1
+sqlitedict==1.6.0
\ No newline at end of file
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter
writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Metrics"]
values = []
def chk(tf):
if tf:
return '✓'
else:
return ' '
for tname, Task in tasks.TASK_REGISTRY.items():
task = Task()
values.append([tname,chk(task.has_training_docs()),chk(task.has_validation_docs()),chk(task.has_test_docs()),', '.join(task.aggregation().keys())])
writer.value_matrix = values
print(writer.dumps())
\ No newline at end of file
import lm_eval.tasks as tasks
import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
def test_evaluator(taskname, Task):
task_dict = tasks.get_task_dict([taskname])
lm = models.get_model('dummy')()
def ll_fn(reqs):
for ctx, cont in reqs:
# space convention
assert ctx[-1] != ' '
assert cont[0] == ' ' or ctx[-1] == '\n'
res = []
random.seed(42)
for _ in reqs:
res.append((-random.random(), False))
return res
lm.loglikelihood = ll_fn
evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
import lm_eval.models as models
import lm_eval.base as base
def test_gpt2():
gpt2 = models.get_model('gpt2').create_from_arg_string("device=cpu")
(ll_dog, ig_dog), (ll_cat, ig_cat) = gpt2.loglikelihood([
('The quick brown fox jumps over the lazy', ' dog'),
('The quick brown fox jumps over the lazy', ' cat'),
])
assert ll_dog > ll_cat
assert not ig_cat
 import lm_eval.tasks as tasks
 import lm_eval.base as base
-from unittest.mock import MagicMock
 from itertools import islice
 import pytest
@@ -8,10 +7,10 @@ import pytest
 @pytest.mark.parametrize("taskname,Task", tasks.TASK_REGISTRY.items())
 def test_basic_interface(taskname, Task):
     print('Evaluating task', taskname)
-    dl = Task.download
-    Task.download = MagicMock()
+    #dl = Task.download
+    #Task.download = MagicMock()
     task = Task()
-    Task.download = dl
+    #Task.download = dl

     assert task.has_training_docs() in [True, False]
     assert task.has_validation_docs() in [True, False]
@@ -44,8 +43,12 @@ def test_documents_and_requests(taskname, Task):
         assert isinstance(txt, str)
         assert isinstance(tgt, str)

+        # space convention
+        assert txt[-1] != ' '
+        assert tgt[0] == ' ' or txt[-1] == '\n'
+
         reqs = task.construct_requests(doc, txt)

-        # todo: mock lm by pluggin what's currently in main.py in here
+        # todo: mock lm after refactoring evaluator.py to not be a mess
         for req in reqs:
             assert isinstance(req, base.Request)