Commit cf80f340 authored by Jason Phang

glue tasks

parent c2aaa501
......@@ -103,14 +103,16 @@ class Dataset(abc.ABC):
"""
pass
def fewshot_prefix(self):
def fewshot_description(self):
return ""
def fewshot_context(self, doc, k):
prefix = self.fewshot_prefix()
labeled_examples = "\n\n".join([self.doc_to_text(doc) for doc in self.fewshot_examples(k)])
example = self.doc_to_text(doc, include_target=False)
return prefix + labeled_examples + example
def fewshot_context(self, doc, num_fewshot, provide_description):
description = (self.fewshot_description() + "\n\n") if provide_description else ""
labeled_examples = "\n\n".join(
map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
) + "\n\n"
example = self.doc_to_text(doc, include_target=False).strip()
return description + labeled_examples + example
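For reference, a minimal sketch of the prompt layout the new fewshot_context produces: the optional description, then the labeled few-shot examples joined by blank lines, then the stripped, unlabeled query. The sentences below are invented for illustration and are not part of this commit.
# Illustrative only: simulating fewshot_context for num_fewshot=2 with a description.
description = "Does this sentence make sense?\n\n"
labeled_examples = "\n\n".join([
    "Sentence: The cat sat on the mat.\nAnswer: True",
    "Sentence: Mat the on sat the cat.\nAnswer: False",
]) + "\n\n"
example = "Sentence: Dogs bark loudly.\nAnswer:"
prompt = description + labeled_examples + example
print(prompt)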
class Registry:
......
......@@ -23,7 +23,7 @@ class BoolQ(base.Dataset):
def test_docs(self):
return []
def fewshot_description(self):
def fewshot_prefix(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
......
from models.gpt2 import GPT2LM
import argparse
import json
lm = GPT2LM()
import models
import tasks
print(lm.generate('1 + 1 = 2.\n3 + 5 = 8.\n4 + 9 = 13.\n4 + 3 = 7.\n2 + 3 =', '.'))
\ No newline at end of file
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
parser.add_argument('--model_args', default="")
parser.add_argument('--tasks', default="all_tasks")
parser.add_argument('--provide_description', action="store_true")
parser.add_argument('--num_fewshot', type=int, default=0)
return parser.parse_args()
def main():
args = parse_args()
model = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
else:
task_names = args.tasks.split(",")
task_list = {
task_name: tasks.get_task(task_name)()
for task_name in task_names
}
results = {}
for task_name, task in task_list.items():
if not task.has_validation_docs():
continue
result = task.evaluate(
docs=task.validation_docs(),
lm=model,
provide_description=args.provide_description,
num_fewshot=args.num_fewshot,
)
results[task_name] = result
print(json.dumps(results, indent=2))
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -16,5 +16,8 @@ for file in os.listdir(tasks_dir):
module = importlib.import_module('lm_evaluation_harness.tasks.' + module_name)
ALL_TASKS = sorted(list(TASK_REGISTRY.registry))
def get_task(task_name):
return TASK_REGISTRY.registry[task_name]
......@@ -31,3 +31,10 @@ def simple_accuracy_metric(preds, golds):
"minor": {"acc": acc},
"higher_is_better": True,
}
def yesno(x):
if x:
return 'yes'
else:
return 'no'
import nlp
import numpy as np
import random
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from . common import NLP_TASK, simple_accuracy_metric
from tqdm import auto as tqdm_lib
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
def get_accuracy_and_f1(preds, golds):
golds = np.array(golds)
preds = np.array(preds)
acc = float((preds == golds).mean())
f1 = float(f1_score(y_true=golds, y_pred=preds))
minor = {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
return {
"major": minor["acc_and_f1"],
"minor": minor,
"higher_is_better": True,
}
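As a quick sanity check of get_accuracy_and_f1 (toy values, assuming the function and imports above are in scope), three correct predictions out of four give an accuracy of 0.75 and an F1 of 0.8:
# Toy example only; the labels below are invented for illustration.
toy_golds = [1, 0, 1, 1]
toy_preds = [1, 0, 0, 1]
toy_result = get_accuracy_and_f1(preds=toy_preds, golds=toy_golds)
# toy_result["minor"] -> {"acc": 0.75, "f1": 0.8, "acc_and_f1": 0.775}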
@TASK_REGISTRY.register("cola")
class CoLA(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "cola"
def has_training_docs(self):
return True
......@@ -17,27 +37,25 @@ class CoLA(NLP_TASK):
def has_test_docs(self):
return True
def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?"
def doc_to_text(self, doc, include_target=True):
text = "Does this sentence make sense?:\tTrue or False?" \
"\nsentence:{}\nAnswer: ".format(doc["sentence"])
text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
if include_target:
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, k=0):
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
word = lm.generate(
context=self.fewshot_context(doc=doc, k=k),
max_gen_length=1,
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
if word.strip() == "True":
preds.append(1)
elif word.strip() == "False":
preds.append(0)
else:
preds.append(-1)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
golds = np.array(golds)
preds = np.array(preds)
mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
......@@ -50,6 +68,9 @@ class CoLA(NLP_TASK):
@TASK_REGISTRY.register("mnli")
class MNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mnli"
def has_training_docs(self):
return True
......@@ -69,8 +90,8 @@ class MNLI(NLP_TASK):
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
......@@ -79,26 +100,65 @@ class MNLI(NLP_TASK):
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, k=0):
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
word = lm.generate(
context=self.fewshot_context(doc=doc, k=k),
max_gen_length=1,
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
if word.strip() == "True":
preds.append(1)
elif word.strip() == "False":
preds.append(0)
else:
preds.append(-1)
probs = np.array([
lm.loglikelihood(ctx, ' True'),
lm.loglikelihood(ctx, ' Neither'),
lm.loglikelihood(ctx, ' False'),
])
preds.append(np.argmax(probs))
return simple_accuracy_metric(preds=preds, golds=golds)
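The loop above follows the rank-classification pattern used throughout this file: score each candidate answer string with lm.loglikelihood and keep the highest-scoring one. A hedged, self-contained sketch with a stub LM is shown below; the real loglikelihood implementation lives in the models package and is assumed here, not defined in this diff.
import numpy as np

class StubLM:
    """Illustrative stand-in for a real LM; returns fixed, made-up scores."""
    def loglikelihood(self, context, continuation):
        return {" True": -1.0, " Neither": -2.0, " False": -3.0}.get(continuation, -10.0)

stub = StubLM()
stub_ctx = "Some premise.\nquestion:\tSome hypothesis.\tTrue, False or Neither?\nanswer:"
choices = [" True", " Neither", " False"]
stub_pred = int(np.argmax([stub.loglikelihood(stub_ctx, c) for c in choices]))
# stub_pred == 0, i.e. the verbalized label the (fake) model scores highest.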
@TASK_REGISTRY.register("mrpc")
class MRPC(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mrpc"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("rte")
class RTE(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "rte"
......@@ -120,18 +180,230 @@ class RTE(NLP_TASK):
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, k=0):
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qnli")
class QNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["question"],
doc["sentence"],
)
if include_target:
# True = entailment
# False = not entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qqp")
class QQP(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qqp"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
doc["question1"],
doc["question2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("stsb")
class STSB(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "stsb"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
word = lm.generate(
context=self.fewshot_context(doc=doc, k=k),
max_gen_length=1,
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
if word.strip() == "True":
preds.append(1)
elif word.strip() == "False":
preds.append(0)
output = lm.generate(context=ctx, max_gen_length=5).strip()
first_element = output.split()[0] if output.split() else ""
# accept decimal outputs such as "2.5" as well as integers
if first_element.replace('.', '', 1).isnumeric():
pred = max(min(float(first_element), 5.0), 0.0)
else:
preds.append(-1)
pred = 2.5
preds.append(pred)
pearson_corr = float(pearsonr(preds, golds)[0])
spearman_corr = float(spearmanr(preds, golds)[0])
minor = {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
return {
"major": minor["corr"],
"minor": minor,
"higher_is_better": True,
}
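As a small illustration of the combined correlation metric (invented scores, assuming the scipy imports above are in scope): predictions that preserve the gold ranking but not the exact values give a Spearman correlation of 1.0 and a Pearson correlation just below it.
# Toy example only; values are made up.
toy_golds = [0.0, 1.0, 2.5, 5.0]
toy_preds = [0.5, 1.0, 3.0, 4.5]
toy_pearson = float(pearsonr(toy_preds, toy_golds)[0])    # ~0.98
toy_spearman = float(spearmanr(toy_preds, toy_golds)[0])  # 1.0
toy_corr = (toy_pearson + toy_spearman) / 2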
@TASK_REGISTRY.register("sst")
class SST(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "sst2"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative."
def doc_to_text(self, doc, include_target=True):
text = "sentence:\t{}\t\nanswer:".format(
doc["sentence"],
)
if include_target:
text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("wnli")
class WNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "wnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
return simple_accuracy_metric(preds=preds, golds=golds)
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
@TASK_REGISTRY.register("boolq")
class BoolQ(NLP_TASK):
NLP_PATH = "superglue"
NLP_NAME = "boolq"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+ (yesno(doc['answer']) if include_target else "")
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["answer"] for doc in docs]
preds = []
for doc in docs:
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return simple_accuracy_metric(preds=preds, golds=golds)