Commit 8ac99269 authored by Jonathan Tow

Replace the `fewshot_description` API with a `description_dict` based interface

parent b67aec37
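In the new interface, per-task descriptions come from a JSON file that maps task names to description strings. A minimal usage sketch of the new `description_path` argument follows; the task names, model arguments, and evaluator module path are assumptions for illustration, not part of this commit:

# descriptions.json -- hypothetical contents; keys must match task names
# {
#     "lambada": "Fill in the blank:",
#     "sst": "Indicate if the sentiment of each sentence is positive or negative."
# }

from lm_eval.evaluator import simple_evaluate  # module path assumed

results = simple_evaluate(
    model="gpt2",
    model_args="",
    task_names=["lambada", "sst"],
    description_path="descriptions.json",  # loaded via json.load into description_dict
    num_fewshot=2,
)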
......@@ -2,6 +2,7 @@ import abc
import random
import numpy as np
import re
from lm_eval import tasks
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
......@@ -224,11 +225,15 @@ class Task(abc.ABC):
pass
def fewshot_description(self):
import warnings
warnings.warn(
"`fewshot_description` will be removed in coming versions. Pass " \
"any custom descriptions to the `evaluate` function instead.",
DeprecationWarning)
return ""
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
raw_description = self.fewshot_description()
description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
def fewshot_context(self, doc, num_fewshot, rnd, description=None):
description = description + "\n\n" if description else ""
if num_fewshot == 0:
labeled_examples = ""
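With the new signature, the description is simply prefixed to the prompt (followed by a blank line) rather than toggled by `provide_description`. A rough zero-shot sketch of the resulting context; the `task` and `doc` objects are assumed to come from the usual task plumbing:

import random

rnd = random.Random(42)
ctx = task.fewshot_context(
    doc=doc,
    num_fewshot=0,
    rnd=rnd,
    description="Indicate if the sentiment of each sentence is positive or negative.",
)
# With num_fewshot == 0 there are no labeled examples, so ctx is roughly
# description + "\n\n" + task.doc_to_text(doc)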
......@@ -295,16 +300,13 @@ class PerplexityTask(Task, abc.ABC):
def has_training_docs(self):
return False
def fewshot_description(self):
return ""
def fewshot_examples(self, k, rnd):
assert k == 0
return []
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
def fewshot_context(self, doc, num_fewshot, rnd, description=None):
assert num_fewshot == 0
assert not provide_description
assert description is None
return ""
def higher_is_better(self):
......
import collections
import itertools
import json
import random
import lm_eval.metrics
import lm_eval.models
......@@ -7,7 +8,7 @@ import lm_eval.tasks
import lm_eval.base
import numpy as np
def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
def simple_evaluate(model, model_args, task_names, description_path=None, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
random.seed(1234)
np.random.seed(1234)
......@@ -19,7 +20,12 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
task_dict = lm_eval.tasks.get_task_dict(task_names)
results = evaluate(lm, task_dict, False, num_fewshot, limit)
description_dict = {}
if description_path:
with open(description_path, 'r') as f:
description_dict = json.load(f)
results = evaluate(lm, task_dict, num_fewshot, limit, description_dict)
# add info about the model and few shot config
results["config"] = {
......@@ -28,6 +34,8 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"device": device,
# TODO (jon-tow): Should we add the description info to `results["config"]`?
# "description_dict": description_dict,
"no_cache": no_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters
......@@ -36,9 +44,7 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
return results
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
assert not provide_description # not implemented. todo: implement proper description-providing system
def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap_iters=100000):
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
......@@ -73,16 +79,16 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
rnd.seed(42)
rnd.shuffle(task_docs)
description = description_dict[task_name] if description_dict and task_name in description_dict else ""
for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
docs[(task_name, doc_id)] = doc
ctx = task.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
rnd=rnd
rnd=rnd,
description=description
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
for i, req in enumerate(reqs):
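Callers that invoke `evaluate` directly now pass the mapping itself instead of the old `provide_description` flag. A sketch with an illustrative task name and an `lm` instance constructed elsewhere:

task_dict = lm_eval.tasks.get_task_dict(["lambada"])
description_dict = {"lambada": "Fill in the blank:"}
results = evaluate(
    lm,
    task_dict,
    num_fewshot=0,
    limit=None,
    description_dict=description_dict,
)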
......@@ -168,4 +174,4 @@ def make_table(result_dict):
# todo: make latex table look good
# print(latex_writer.dumps())
return md_writer.dumps()
\ No newline at end of file
return md_writer.dumps()
......@@ -33,10 +33,6 @@ class ANLIBase(HFTask):
if self.has_test_docs():
return self.data["test_r" + str(self.SPLIT)]
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
# OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
......
......@@ -29,10 +29,6 @@ class ARCEasy(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
......
......@@ -17,10 +17,6 @@ class CBTBase(HFTask):
VERSION = 0
def fewshot_description(self):
# TODO: Figure out description.
return ""
def detokenize(self, text):
text = text.replace(" '", "'")
text = text.replace(" \n", "\n")
......
......@@ -36,10 +36,7 @@ class CoQA(Task):
def test_docs(self):
pass
def fewshot_description(self):
return "Given a passage and a conversation so far, answer the next question in the conversation."
def doc_to_text(self, doc):
# Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# and a question qi, the task is to predict the answer ai
......
......@@ -40,10 +40,6 @@ class DROP(Task):
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO: figure out description
return ""
def _load_docs(self, docs):
for doc in docs:
for qa in doc["qa_pairs"]:
......
......@@ -21,10 +21,6 @@ class CoLA(HFTask):
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO
return ""
def doc_to_text(self, doc):
return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
......@@ -69,9 +65,6 @@ class SST(HFTask):
def has_test_docs(self):
return False
def fewshot_description(self):
return "Indicate if the sentiment of each sentence is positive or negative."
def doc_to_text(self, doc):
return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
general_detokenize(doc["sentence"]),
......@@ -342,9 +335,6 @@ class MRPC(HFTask):
def has_test_docs(self):
return False
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc):
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
general_detokenize(doc["sentence1"]),
......@@ -395,9 +385,6 @@ class QQP(HFTask):
def has_test_docs(self):
return False
def fewshot_description(self):
return "Indicate if both questions ask the same thing."
def doc_to_text(self, doc):
return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
doc["question1"],
......@@ -448,10 +435,6 @@ class STSB(HFTask):
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc):
return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
doc["sentence1"],
......
......@@ -25,9 +25,5 @@ class HeadQA(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
......@@ -35,10 +35,5 @@ class HellaSwag(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
return "Label for the relevant action: Sentences describing the " \
"context, with an incomplete sentence trailing\nanswer that " \
"plausibly completes the situation."
def doc_to_text(self, doc):
return doc["query"]
......@@ -55,9 +55,6 @@ class Math(Task):
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info())
def fewshot_description(self):
return "Given a mathematics problem, determine the answer. Simplify your answer as much as possible."
def doc_to_text(self, doc):
return "Problem: " + doc["problem"] + "\nAnswer:"
......
......@@ -47,10 +47,6 @@ class LAMBADA(Task):
def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
# TODO: figure out description
return ""
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
......
......@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA):
def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
return "Fill in blank:\n"
......@@ -80,9 +80,5 @@ class LogiQA(MultipleChoiceTask):
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "Test.txt")
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc):
return doc["query"]
......@@ -29,9 +29,5 @@ class MathQA(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
......@@ -39,9 +39,6 @@ class MCTACO(HFTask):
def has_test_docs(self):
return True
def fewshot_description(self):
return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
......
......@@ -70,10 +70,6 @@ class MuTualBase(Task):
def test_docs(self):
return NotImplemented
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return self.detokenize(doc["article"])
......
......@@ -21,10 +21,6 @@ class NaturalQs(HFTask):
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO: figure out description
return ""
def training_docs(self):
# Cache training for faster few-shot.
# Data is too large to fit in memory.
......
......@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return doc["query"]
......@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def _convert_standard(self, doc):
out_doc = {
"goal": doc["goal"],
......