".github/workflows/cuda/cu113-Windows.sh" did not exist on "eb43b46457e446ac7aba3ec9d1a74fd5c9d83048"
Commit cf80f340 authored by Jason Phang

glue tasks

parent c2aaa501
@@ -103,14 +103,16 @@ class Dataset(abc.ABC):
         """
         pass
 
-    def fewshot_prefix(self):
+    def fewshot_description(self):
         return ""
 
-    def fewshot_context(self, doc, k):
-        prefix = self.fewshot_prefix()
-        labeled_examples = "\n\n".join([self.doc_to_text(doc) for doc in self.fewshot_examples(k)])
-        example = self.doc_to_text(doc, include_target=False)
-        return prefix + labeled_examples + example
+    def fewshot_context(self, doc, num_fewshot, provide_description):
+        description = (self.fewshot_description() + "\n\n") if provide_description else ""
+        labeled_examples = "\n\n".join(
+            map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
+        ) + "\n\n"
+        example = self.doc_to_text(doc, include_target=False).strip()
+        return description + labeled_examples + example
 
 
 class Registry:
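For reference, the reworked fewshot_context builds a prompt as an optional task description, followed by num_fewshot labeled examples, followed by the unlabeled query. A minimal standalone sketch of that assembly; the docs and the CoLA-style formatting below are illustrative, not taken from the repo:

# Standalone sketch of the prompt assembly done by the new fewshot_context.
# The example docs and doc_to_text format are made up for illustration.

def doc_to_text(doc, include_target=True):
    text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
    if include_target:
        text += " {}".format({1: "True", 0: "False"}[doc["label"]])
    return text

def fewshot_context(doc, fewshot_docs, num_fewshot, provide_description, description=""):
    # optional description, then labeled shots, then the unlabeled query
    desc = (description + "\n\n") if provide_description else ""
    labeled = "\n\n".join(map(doc_to_text, fewshot_docs[:num_fewshot])) + "\n\n"
    example = doc_to_text(doc, include_target=False).strip()
    return desc + labeled + example

if __name__ == "__main__":
    shots = [
        {"sentence": "The cat sat on the mat.", "label": 1},
        {"sentence": "Cat the on mat sat.", "label": 0},
    ]
    query = {"sentence": "She gave the book to him.", "label": 1}
    print(fewshot_context(query, shots, num_fewshot=2, provide_description=True,
                          description="Does this sentence make sense?:\tTrue or False?"))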
@@ -23,7 +23,7 @@ class BoolQ(base.Dataset):
     def test_docs(self):
         return []
 
-    def fewshot_description(self):
+    def fewshot_prefix(self):
         return "Read the following passages and answer each question with a yes or a no."
 
     def doc_to_text(self, doc, include_target=True):
-from models.gpt2 import GPT2LM
-
-lm = GPT2LM()
-
-print(lm.generate('1 + 1 = 2.\n3 + 5 = 8.\n4 + 9 = 13.\n4 + 3 = 7.\n2 + 3 =', '.'))
\ No newline at end of file
+import argparse
+import json
+
+import models
+import tasks
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', required=True)
+    parser.add_argument('--model_args', default="")
+    parser.add_argument('--tasks', default="all_tasks")
+    parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument('--num_fewshot', type=int, default=0)
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    model = models.get_model(args.model).create_from_arg_string(args.model_args)
+    if args.tasks == "all_tasks":
+        task_names = tasks.ALL_TASKS
+    else:
+        task_names = args.tasks.split(",")
+    task_list = {
+        task_name: tasks.get_task(task_name)()
+        for task_name in task_names
+    }
+    results = {}
+    for task_name, task in task_list.items():
+        if not task.has_validation_docs():
+            continue
+        result = task.evaluate(
+            docs=task.validation_docs(),
+            lm=model,
+            provide_description=args.provide_description,
+            num_fewshot=args.num_fewshot,
+        )
+        results[task_name] = result
+    print(json.dumps(results, indent=2))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
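The tasks below only require the model object handed to evaluate to expose loglikelihood(context, continuation), plus generate for STS-B. A rough sketch of that contract with a stand-in model; DummyLM and the scores it returns are illustrative assumptions, not code from the repo:

# Stand-in model showing the interface the task evaluate() methods rely on.
# DummyLM is hypothetical; a real model would return actual log-likelihoods.
import random

class DummyLM:
    def loglikelihood(self, context, continuation):
        # A real model returns log P(continuation | context); here it is random.
        return random.random()

    def generate(self, context, max_gen_length):
        # Only STS-B uses generation; it parses a number out of the output.
        return "3"

if __name__ == "__main__":
    lm = DummyLM()
    ctx = "\nSentence:She gave the book to him.\nAnswer: "
    # The classification tasks compare candidate answers by log-likelihood:
    pred = lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False')
    print("predict True?", pred)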
@@ -16,5 +16,8 @@ for file in os.listdir(tasks_dir):
         module = importlib.import_module('lm_evaluation_harness.tasks.' + module_name)
 
+ALL_TASKS = sorted(list(TASK_REGISTRY.registry))
+
+
 def get_task(model_name):
     return TASK_REGISTRY.registry[model_name]
@@ -31,3 +31,10 @@ def simple_accuracy_metric(preds, golds):
         "minor": {"acc": acc},
         "higher_is_better": True,
     }
+
+
+def yesno(x):
+    if x:
+        return 'yes'
+    else:
+        return 'no'
+import nlp
 import numpy as np
-import random
+from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
-from . common import NLP_TASK, simple_accuracy_metric
+from tqdm import auto as tqdm_lib
+
+from . common import NLP_TASK, simple_accuracy_metric, yesno
 from . import TASK_REGISTRY
 
 
+def get_accuracy_and_f1(preds, golds):
+    golds = np.array(golds)
+    preds = np.array(preds)
+    acc = float((preds == golds).mean())
+    f1 = float(f1_score(y_true=golds, y_pred=preds))
+    minor = {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+    return {
+        "major": minor["acc_and_f1"],
+        "minor": minor,
+        "higher_is_better": True,
+    }
+
+
 @TASK_REGISTRY.register("cola")
 class CoLA(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "cola"
+
     def has_training_docs(self):
         return True
@@ -17,27 +37,25 @@ class CoLA(NLP_TASK):
     def has_test_docs(self):
         return True
 
+    def fewshot_description(self):
+        return "Does this sentence make sense?:\tTrue or False?"
+
     def doc_to_text(self, doc, include_target=True):
-        text = "Does this sentence make sense?:\tTrue or False?" \
-               "\nsentence:{}\nAnswer: ".format(doc["sentence"])
+        text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
         if include_target:
             text += " {}".format({1: "True", 0: "False"}[doc["label"]])
         return text
 
-    def evaluate(self, docs, lm, k=0):
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
         golds = [doc["label"] for doc in docs]
         preds = []
-        for doc in docs:
-            word = lm.generate(
-                context=self.fewshot_context(doc=doc, k=k),
-                max_gen_length=1,
-            )
-            if word.strip() == "True":
-                preds.append(1)
-            elif word.strip() == "False":
-                preds.append(0)
-            else:
-                preds.append(-1)
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
         golds = np.array(golds)
         preds = np.array(preds)
         mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
@@ -50,6 +68,9 @@ class CoLA(NLP_TASK):
 
 @TASK_REGISTRY.register("mnli")
 class MNLI(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "mnli"
+
     def has_training_docs(self):
         return True
@@ -69,8 +90,8 @@ class MNLI(NLP_TASK):
     def doc_to_text(self, doc, include_target=True):
         text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
-            doc["sentence1"],
-            doc["sentence2"],
+            doc["premise"],
+            doc["hypothesis"],
         )
         if include_target:
             # True = entailment
@@ -79,26 +100,65 @@ class MNLI(NLP_TASK):
             text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
         return text
 
-    def evaluate(self, docs, lm, k=0):
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
         golds = [doc["label"] for doc in docs]
         preds = []
-        for doc in docs:
-            word = lm.generate(
-                context=self.fewshot_context(doc=doc, k=k),
-                max_gen_length=1,
-            )
-            if word.strip() == "True":
-                preds.append(1)
-            elif word.strip() == "False":
-                preds.append(0)
-            else:
-                preds.append(-1)
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            probs = np.array([
+                lm.loglikelihood(ctx, ' True'),
+                lm.loglikelihood(ctx, ' Neither'),
+                lm.loglikelihood(ctx, ' False'),
+            ])
+            preds.append(np.argmax(probs))
         return simple_accuracy_metric(preds=preds, golds=golds)
+
+
+@TASK_REGISTRY.register("mrpc")
+class MRPC(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "mrpc"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if both sentences mean the same thing."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
+            doc["sentence1"],
+            doc["sentence2"],
+        )
+        if include_target:
+            text += " {}".format(yesno(doc["label"]))
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
+        return get_accuracy_and_f1(preds=preds, golds=golds)
 
 
 @TASK_REGISTRY.register("rte")
 class RTE(NLP_TASK):
     NLP_PATH = "glue"
     NLP_NAME = "rte"
@@ -120,18 +180,230 @@ class RTE(NLP_TASK):
             text += " {}".format({1: "True", 0: "False"}[doc["label"]])
         return text
 
-    def evaluate(self, docs, lm, k=0):
-        golds = [doc["label"] for doc in docs]
-        preds = []
-        for doc in docs:
-            word = lm.generate(
-                context=self.fewshot_context(doc=doc, k=k),
-                max_gen_length=1,
-            )
-            if word.strip() == "True":
-                preds.append(1)
-            elif word.strip() == "False":
-                preds.append(0)
-            else:
-                preds.append(-1)
-        return simple_accuracy_metric(preds=preds, golds=golds)
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
+        return simple_accuracy_metric(preds=preds, golds=golds)
+
+
+@TASK_REGISTRY.register("qnli")
+class QNLI(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "qnli"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
+            doc["question"],
+            doc["sentence"],
+        )
+        if include_target:
+            # True = entailment
+            # False = not entailment
+            text += " {}".format({0: "True", 1: "False"}[doc["label"]])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
+        return simple_accuracy_metric(preds=preds, golds=golds)
+
+
+@TASK_REGISTRY.register("qqp")
+class QQP(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "qqp"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if both sentences mean the same thing."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
+            doc["question1"],
+            doc["question2"],
+        )
+        if include_target:
+            text += " {}".format(yesno(doc["label"]))
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
+        return get_accuracy_and_f1(preds=preds, golds=golds)
+
+
+@TASK_REGISTRY.register("stsb")
+class STSB(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "stsb"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
+               "where 5 means identical and 0 means unrelated."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
+            doc["sentence1"],
+            doc["sentence2"],
+        )
+        if include_target:
+            text += " {}".format(doc["label"])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            output = lm.generate(context=ctx, max_gen_length=5).strip()
+            first_element = output.split()[0]
+            if first_element.isnumeric():
+                pred = max(min(float(first_element), 5.0), 0.0)
+            else:
+                pred = 2.5
+            preds.append(pred)
+        pearson_corr = float(pearsonr(preds, golds)[0])
+        spearman_corr = float(spearmanr(preds, golds)[0])
+        minor = {
+            "pearson": pearson_corr,
+            "spearmanr": spearman_corr,
+            "corr": (pearson_corr + spearman_corr) / 2,
+        }
+        return {
+            "major": minor["corr"],
+            "minor": minor,
+            "higher_is_better": True,
+        }
+
+
+@TASK_REGISTRY.register("sst")
+class SST(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "sst2"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if each sentence is Positive or Negative."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "sentence:\t{}\t\nanswer:".format(
+            doc["sentence"],
+        )
+        if include_target:
+            text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
+        return simple_accuracy_metric(preds=preds, golds=golds)
+
+
+@TASK_REGISTRY.register("wnli")
+class WNLI(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "wnli"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
+            doc["premise"],
+            doc["hypothesis"],
+        )
+        if include_target:
+            # True = entailment
+            # False = contradiction
+            # Neither = neutral
+            text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            probs = np.array([
+                lm.loglikelihood(ctx, ' True'),
+                lm.loglikelihood(ctx, ' Neither'),
+                lm.loglikelihood(ctx, ' False'),
+            ])
+            preds.append(np.argmax(probs))
+        return simple_accuracy_metric(preds=preds, golds=golds)
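The GLUE classification tasks above all share one scoring pattern: score each candidate answer string as a continuation of the few-shot context and keep the highest-scoring one. A repo-independent sketch of that pattern; pick_label and the toy score function are hypothetical helpers, not part of this commit:

# Generic candidate-scoring pattern behind the MNLI/WNLI-style evaluate methods.
# score() stands in for lm.loglikelihood; the candidates mirror the prompts above.
import numpy as np

def pick_label(ctx, candidates, score):
    # Return the index of the candidate continuation with the highest score.
    probs = np.array([score(ctx, cand) for cand in candidates])
    return int(np.argmax(probs))

if __name__ == "__main__":
    def score(ctx, continuation):
        # Toy scorer: prefers shorter continuations; a real LM returns a log-likelihood.
        return -len(continuation)

    label = pick_label("premise\nquestion:\thypothesis\tTrue, False or Neither?\nanswer:",
                       [' True', ' Neither', ' False'], score)
    print(label)  # index into {0: "True", 1: "Neither", 2: "False"}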
+from . common import NLP_TASK, simple_accuracy_metric, yesno
+from . import TASK_REGISTRY
+
+
+@TASK_REGISTRY.register("boolq")
+class BoolQ(NLP_TASK):
+    NLP_PATH = "superglue"
+    NLP_NAME = "boolq"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Read the following passages and answer each question with a yes or a no."
+
+    def doc_to_text(self, doc, include_target=True):
+        return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+               + (yesno(doc['answer']) if include_target else "")
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["answer"] for doc in docs]
+        preds = []
+        for doc in docs:
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
+        return simple_accuracy_metric(preds=preds, golds=golds)