Commit 36467c0e authored by Jonathan Tow

Adopt new framework for `glue`

parent 4c8d22db
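The change below replaces each GLUE task's monolithic evaluate() loop with four smaller hooks: construct_requests() declares which log-likelihoods the model should score for a document, process_results() turns the returned scores into per-document metric entries, and higher_is_better() / aggregation() tell the harness how to combine them. As a rough sketch of how an evaluator might drive these hooks (the evaluate_task driver and the score_requests callable below are illustrative assumptions, not part of this commit):

# Conceptual sketch only: `evaluate_task` and `score_requests` are assumed names,
# not this repository's evaluator; the hook names match the diff below.
def evaluate_task(task, score_requests, docs, num_fewshot=0):
    per_metric = {}
    for doc in docs:
        ctx = task.fewshot_context(
            doc=doc,
            provide_description=False,
            num_fewshot=num_fewshot,
        )
        requests = task.construct_requests(doc, ctx)
        results = score_requests(requests)  # e.g. the LM's loglikelihood for each request
        for name, value in task.process_results(doc, results).items():
            per_metric.setdefault(name, []).append(value)
    aggregators = task.aggregation()
    return {name: aggregators[name](values) for name, values in per_metric.items()}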
import abc
import random

import numpy as np
import sklearn


class LM(abc.ABC):
    @abc.abstractmethod
...@@ -177,15 +176,23 @@ class Dataset(abc.ABC):
        return description + labeled_examples + example


def mean(arr):
    return sum(arr) / len(arr)


def median(arr):
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)
    # sklearn.metrics.f1_score returns a scalar here, so return it directly;
    # max(fscore) was only valid when fscore was an array.
    return fscore
def acc_all(items):
...@@ -205,9 +212,6 @@ def acc_all(items):
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


req_ret_lens = {
    'loglikelihood': 2
}
...
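The metric helpers above are now aggregation functions over lists of (gold, pred) pairs rather than over pre-split arrays, which is the shape the per-task process_results() / aggregation() pairs below rely on. A quick sanity check with made-up data, assuming the package is importable as lm_eval and scikit-learn is installed:

# Made-up example: each item is a (gold, pred) pair, the shape process_results() emits.
from lm_eval.base import mean, f1_score, matthews_corrcoef  # same import glue.py uses below

items = [(1, True), (0, False), (1, True), (0, True)]
print(mean([g == p for g, p in items]))  # accuracy over the pairs -> 0.75
print(f1_score(items))                   # corpus-level binary F1 via sklearn
print(matthews_corrcoef(items))          # corpus-level Matthews correlation via sklearn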
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np

from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from scipy.stats import pearsonr, spearmanr
from tqdm import auto as tqdm_lib

from . common import HFTask, yesno

# Single-Sentence Tasks

class CoLA(HFTask):
...@@ -45,31 +32,80 @@ class CoLA(HFTask):
    def doc_to_target(self, doc):
        return " {}".format({1: "True", 0: "False"}[doc["label"]])
    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_false

    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
        return {
            "mcc": (gold, pred)
        }

    def higher_is_better(self):
        return {
            "mcc": True
        }

    def aggregation(self):
        return {
            "mcc": matthews_corrcoef
        }


class SST(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc):
        return "sentence:\t{}\t\nanswer:".format(
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_positive, _ = rf.loglikelihood(ctx, " Positive")
        ll_negative, _ = rf.loglikelihood(ctx, " Negative")
        return ll_positive, ll_negative

    def process_results(self, doc, results):
        ll_positive, ll_negative = results
        pred = ll_positive > ll_negative
        gold = doc["label"]
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }


# Inference Tasks
class MNLI(HFTask):
    DATASET_PATH = "glue"
...@@ -104,27 +140,28 @@ class MNLI(HFTask):
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }


class MNLIMismatched(MNLI):
...@@ -138,9 +175,9 @@ class MNLIMismatched(MNLI):
        return self.data["test_mismatched"]
class QNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"

    def has_training_docs(self):
        return True
...@@ -151,34 +188,88 @@ class QNLI(HFTask):
    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
            doc["question"],
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = not entailment
        return " {}".format({0: "Yes", 1: "No"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " Yes")
        ll_no, _ = rf.loglikelihood(ctx, " No")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        pred = ll_no > ll_yes
        gold = doc["label"]
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
class WNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
class RTE(HFTask):
...@@ -205,27 +296,36 @@ class RTE(HFTask):
        # 1 = not_entailment
        return " {}".format({0: "True", 1: "False"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_false

    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_false > ll_true
        gold = doc["label"]
        return {
            "acc": pred == gold
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
# Similarity and Paraphrase Tasks

class MRPC(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"

    def has_training_docs(self):
        return True
...@@ -236,33 +336,43 @@ class MRPC(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc):
        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        pred = ll_yes > ll_no
        return {
            "acc": pred == gold,
            "f1": (gold, pred),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    def aggregation(self):
        return {
            "acc": mean,
            "f1": f1_score
        }
class QQP(HFTask):
...@@ -290,22 +400,31 @@ class QQP(HFTask):
    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        pred = ll_yes > ll_no
        return {
            "acc": pred == gold,
            "f1": (gold, pred),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    def aggregation(self):
        return {
            "acc": mean,
            "f1": f1_score
        }
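# MRPC and QQP report both accuracy and F1: process_results() returns a boolean for
# "acc" (averaged with mean) and the raw (gold, pred) pair for "f1", which the
# corpus-level f1_score aggregation consumes.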
class STSB(HFTask):
...@@ -368,93 +487,3 @@ class STSB(HFTask):
            "minor": minor,
            "higher_is_better": True,
        }
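For a concrete feel of the converted tasks, here is a self-contained walk-through of the CoLA-style hook flow with a fake document set and fake log-likelihood scores. Nothing here loads a model or dataset; FakeCoLA only mirrors the methods shown in the diff above, and the scores are invented:

# Standalone illustration of the new hook flow; everything below is mocked,
# so it does not import the real task or call rf.loglikelihood.
import sklearn.metrics

class FakeCoLA:
    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        return {"mcc": (doc["label"], pred)}

    def aggregation(self):
        def matthews_corrcoef(items):
            golds, preds = zip(*items)
            return sklearn.metrics.matthews_corrcoef(golds, preds)
        return {"mcc": matthews_corrcoef}

task = FakeCoLA()
docs = [{"label": 1}, {"label": 0}, {"label": 1}]
fake_loglikelihoods = [(-1.2, -3.4), (-2.0, -0.5), (-0.9, -1.1)]  # (ll_true, ll_false) per doc

items = [task.process_results(d, r)["mcc"] for d, r in zip(docs, fake_loglikelihoods)]
print(task.aggregation()["mcc"](items))  # corpus-level Matthews correlation (1.0 here)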