Unverified commit 4c8d22db, authored by Leo Gao, committed by GitHub

Merge pull request #85 from EleutherAI/sglue-eval-fazz

SuperGLUE commitmentbank implementation (new framework)
parents 0e4139b8 173b33a4
 import abc
 import random
 import collections
+import numpy as np
+from sklearn.metrics import precision_recall_fscore_support as score

 class LM(abc.ABC):
     @abc.abstractmethod
@@ -180,6 +181,30 @@ class Dataset(abc.ABC):
 def mean(arr):
     return sum(arr) / len(arr)

+def f1_score(items):
+    unzipped_list = list(zip(*items))
+    golds = unzipped_list[0]
+    preds = unzipped_list[1]
+    precision, recall, fscore, support = score(golds, preds)
+    return max(fscore)
+
+def acc_all(items):
+    # Only count as correct if all answers are labeled correctly for each question
+    question_scoring_dict = {}
+    preds = list(zip(*items))[0]
+    docs = list(zip(*items))[1]
+    for (doc, pred) in zip(docs, preds):
+        question_id = doc["idx"]["question"]
+        if question_id not in question_scoring_dict:
+            question_scoring_dict[question_id] = []
+        gold_label = doc["label"] == 1
+        question_scoring_dict[question_id].append(gold_label == pred)
+    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+    return acc
+
 def median(arr):
     return arr[len(arr) // 2]
......
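A quick usage sketch for the new f1_score aggregator (values are made up; the import assumes this change is applied). Each item is a (pred, gold) label pair of the kind CommitmentBank.process_results emits below; the function returns the maximum per-class F1, and the result does not depend on whether the pairs are ordered (pred, gold) or (gold, pred), since per-class F1 is symmetric in precision and recall.

from lm_eval.base import f1_score

# Made-up (pred, gold) pairs over CB's three labels {0: true, 1: neither, 2: false}.
items = [(0, 0), (1, 1), (2, 2), (1, 2)]
print(f1_score(items))  # 1.0: the best-scoring class is predicted perfectly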
@@ -27,7 +27,7 @@ TASK_REGISTRY = {
     "wnli": glue.WNLI,
     # SuperGLUE
     "boolq": superglue.BoolQ,
-    "commitmentbank": superglue.CommitmentBank,
+    "cb": superglue.CommitmentBank,
     "copa": superglue.Copa,
     "multirc": superglue.MultiRC,
     "wic": superglue.WordsInContext,
......
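The registry key changes from "commitmentbank" to "cb", matching the dataset's HF config name. A minimal lookup sketch, assuming TASK_REGISTRY is exposed by the lm_eval.tasks package and that tasks construct with no arguments (HFTask loads the dataset on construction):

from lm_eval import tasks  # assumption: the registry module is lm_eval.tasks

task_class = tasks.TASK_REGISTRY["cb"]   # "commitmentbank" would now raise KeyError
cb_task = task_class()                   # downloads/loads super_glue's "cb" config
print(cb_task.has_test_docs())           # True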
@@ -3,7 +3,7 @@
 import numpy as np
 from tqdm import auto as tqdm_lib
 from . common import HFTask, simple_accuracy_metric, yesno
-from lm_eval.base import rf, mean
+from lm_eval.base import rf, mean, f1_score, acc_all

 class BoolQ(HFTask):
     DATASET_PATH = "super_glue"
@@ -55,7 +55,6 @@ class BoolQ(HFTask):
             "acc": mean
         }

 class CommitmentBank(HFTask):
     DATASET_PATH = "super_glue"
     DATASET_NAME = "cb"
@@ -69,8 +68,11 @@ class CommitmentBank(HFTask):
     def has_test_docs(self):
         return True

+    def fewshot_description(self):
+        return "Given a premise and a hypothesis, classify whether the author of the premise is committed to the truth of the hypothesis. The three possible labels are true, false or neither."
+
     def doc_to_text(self, doc):
-        return "{}\nquestion:\t{}\ttrue, false or neither?\nanswer:".format(
+        return "{}\nquestion: {} true, false or neither?\nanswer:".format(
             doc["premise"],
             doc["hypothesis"],
         )
@@ -81,28 +83,34 @@ class CommitmentBank(HFTask):
         # Neither = neutral
         return " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])

-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        golds = [doc["label"] for doc in docs]
-        preds = []
-        for doc in tqdm_lib.tqdm(docs):
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            probs = np.array([
-                lm.loglikelihood(ctx, ' true'),
-                lm.loglikelihood(ctx, ' neither'),
-                lm.loglikelihood(ctx, ' false'),
-            ])
-            preds.append(np.argmax(probs))
-        return simple_accuracy_metric(preds=preds, golds=golds)
+    def construct_requests(self, doc, ctx):
+        ll_true, _ = rf.loglikelihood(ctx, ' true')
+        ll_neither, _ = rf.loglikelihood(ctx, ' neither')
+        ll_false, _ = rf.loglikelihood(ctx, ' false')
+        return ll_true, ll_neither, ll_false
+
+    def process_results(self, doc, results):
+        gold = doc["label"]
+        pred = np.argmax(results)
+        acc = 1. if pred == gold else 0.
+        return {
+            "acc": acc,
+            "f1": (pred, gold)
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True,
+            "f1": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": mean,
+            "f1": f1_score
+        }
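The new control flow, roughly: the evaluator builds a prompt from doc_to_text, resolves the three loglikelihood requests returned by construct_requests against the model, and passes the resolved numbers to process_results; the per-document dicts are then reduced by the callables in aggregation. A sketch with a fabricated doc and fabricated loglikelihoods (the module path and no-argument construction are assumptions):

from lm_eval.tasks.superglue import CommitmentBank  # assumed module path

task = CommitmentBank()  # HFTask loads the super_glue/cb dataset here
doc = {"premise": "He insists he paid.", "hypothesis": "He paid.", "label": 0}  # fabricated example
print(task.doc_to_text(doc))
# He insists he paid.
# question: He paid. true, false or neither?
# answer:

fake_lls = [-1.2, -4.8, -3.5]               # loglikelihoods of ' true', ' neither', ' false'
print(task.process_results(doc, fake_lls))  # {'acc': 1.0, 'f1': (0, 0)}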
 class Copa(HFTask):
     DATASET_PATH = "super_glue"
@@ -117,6 +125,9 @@ class Copa(HFTask):
     def has_test_docs(self):
         return True

+    def fewshot_description(self):
+        return "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative"
+
     def doc_to_text(self, doc):
         # Drop the period
         connector = {
@@ -130,24 +141,33 @@ class Copa(HFTask):
         # Connect the sentences
         return self.convert_choice(correct_choice)

-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        golds = [doc["label"] for doc in docs]
-        preds = []
-        for doc in tqdm_lib.tqdm(docs):
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            choice1 = " " + self.convert_choice(doc["choice1"])
-            choice2 = " " + self.convert_choice(doc["choice2"])
-            preds.append(lm.loglikelihood(ctx, choice2) > lm.loglikelihood(ctx, choice1))
-        return simple_accuracy_metric(preds=preds, golds=golds)
+    def construct_requests(self, doc, ctx):
+        choice1 = " " + self.convert_choice(doc["choice1"])
+        choice2 = " " + self.convert_choice(doc["choice2"])
+
+        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
+        ll_choice2, _ = rf.loglikelihood(ctx, choice2)
+
+        return ll_choice1, ll_choice2
+
+    def process_results(self, doc, results):
+        gold = doc["label"]
+        pred = np.argmax(results)
+        acc = 1. if pred == gold else 0.
+        return {
+            "acc": acc
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": mean
+        }

     @staticmethod
     def convert_choice(choice):
@@ -181,41 +201,33 @@ class MultiRC(HFTask):
         label_str = "True" if label else "False"
         return f"[{label_str}] {answer}"

-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        preds = []
-        for doc in docs:
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            true_choice = self.format_answer(answer=doc["answer"], label=True)
-            false_choice = self.format_answer(answer=doc["answer"], label=False)
-            preds.append(
-                lm.loglikelihood(ctx, f' {true_choice}')
-                > lm.loglikelihood(ctx, f' {false_choice}')
-            )
-        # Only count as correct if all answers are labeled correctly for each question
-        question_scoring_dict = {}
-        for doc, pred in zip(docs, preds):
-            question_id = doc["idx"]["question"]
-            if question_id not in question_scoring_dict:
-                question_scoring_dict[question_id] = []
-            gold_label = doc["label"] == 1
-            question_scoring_dict[question_id].append(gold_label == pred)
-        acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-        return {
-            "major": acc,
-            "minor": {"acc": acc},
-            "higher_is_better": True,
-        }
+    def construct_requests(self, doc, ctx):
+        true_choice = self.format_answer(answer=doc["answer"], label=True)
+        false_choice = self.format_answer(answer=doc["answer"], label=False)
+
+        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
+        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
+
+        return ll_true_choice, ll_false_choice
+
+    def process_results(self, doc, results):
+        gold = doc["label"]
+        pred = np.argmax(results)
+        acc = 1. if pred == gold else 0.
+        return {
+            "acc": (pred, doc)
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": acc_all
+        }
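Unlike CB, MultiRC scores exact match per question, so process_results hands back (pred, doc) instead of a scalar: acc_all (added to base.py above) groups predictions by doc["idx"]["question"] and counts a question as correct only if every one of its answers is classified correctly. A sketch with fabricated items:

from lm_eval.base import acc_all

# Fabricated (pred, doc) pairs: two answers for question 3, two for question 4.
items = [
    (1, {"idx": {"question": 3}, "label": 1}),  # agrees with gold
    (0, {"idx": {"question": 3}, "label": 0}),  # agrees with gold
    (1, {"idx": {"question": 4}, "label": 1}),  # agrees with gold
    (0, {"idx": {"question": 4}, "label": 1}),  # disagrees, so question 4 is not fully correct
]
print(acc_all(items))  # 0.5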
 class WordsInContext(HFTask):
     DATASET_PATH = "super_glue"
@@ -231,7 +243,7 @@ class WordsInContext(HFTask):
         return True

     def doc_to_text(self, doc):
-        return "{}\n{}\nquestion\tIs the word '{}' used in the same way in the" \
+        return "{}\n{}\nQuestion: Is the word '{}' used in the same way in the" \
             " two sentences above?\nanswer:".format(
                 doc["sentence1"],
                 doc["sentence2"],
......