Commit 68a8790c authored by thefazzer

CommitmentBank framework refactor
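
This moves CommitmentBank from the old monolithic evaluate() loop onto the new request-based task interface (construct_requests / process_results / higher_is_better / aggregation), adds a fewshot_description and a doc_to_target, introduces the trueneitherfalse label helper in the shared common module, and shortens the task registry key from "commitmentbank" to "cb".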

parent e5d0229f
@@ -27,7 +27,7 @@ TASK_REGISTRY = {
     "wnli": glue.WNLI,
     # SuperGLUE
     "boolq": superglue.BoolQ,
-    "commitmentbank": superglue.CommitmentBank,
+    "cb": superglue.CommitmentBank,
     "copa": superglue.Copa,
     "multirc": superglue.MultiRC,
     "wic": superglue.WordsInContext,
...
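
After the rename, the task is looked up under the shorter key wherever TASK_REGISTRY is consulted. A sketch of the lookup (the lm_eval.tasks module path is an assumption, not shown in this diff):

    import lm_eval.tasks as tasks

    task_class = tasks.TASK_REGISTRY["cb"]   # was "commitmentbank" before this commit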
@@ -56,3 +56,11 @@ def yesno(x):
         return 'yes'
     else:
         return 'no'
+
+def trueneitherfalse(x):
+    if x == 0:
+        return 'true'
+    elif x == 1:
+        return 'neither'
+    elif x == 2:
+        return 'false'
\ No newline at end of file
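
The helper mirrors the {0: "true", 1: "neither", 2: "false"} mapping already used in CommitmentBank.doc_to_text. A quick standalone sanity check (not part of the commit; the import path is assumed from the relative import in the next file):

    from lm_eval.tasks.common import trueneitherfalse

    assert trueneitherfalse(0) == 'true'
    assert trueneitherfalse(1) == 'neither'
    assert trueneitherfalse(2) == 'false'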
@@ -2,7 +2,7 @@
 import numpy as np
 from tqdm import auto as tqdm_lib

-from . common import HFTask, simple_accuracy_metric, yesno
+from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse
 from lm_eval.base import rf, mean

 class BoolQ(HFTask):
@@ -69,6 +69,9 @@ class CommitmentBank(HFTask):
     def has_test_docs(self):
         return True

+    def fewshot_description(self):
+        return "Given a premise and a hypothesis, classify whether the author of the premise is committed to the truth of the hypothesis. The three possible labels are true, false or neither."
+
     def doc_to_text(self, doc, include_target=True):
         text = "{}\nquestion:\t{}\ttrue, false or neither?\nanswer:".format(
             doc["premise"],
@@ -80,29 +83,34 @@ class CommitmentBank(HFTask):
             # Neither = neutral
             text += " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])
         return text

-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        golds = [doc["label"] for doc in docs]
-        preds = []
-        for doc in tqdm_lib.tqdm(docs):
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            probs = np.array([
-                lm.loglikelihood(ctx, ' true'),
-                lm.loglikelihood(ctx, ' neither'),
-                lm.loglikelihood(ctx, ' false'),
-            ])
-            preds.append(np.argmax(probs))
-        return simple_accuracy_metric(preds=preds, golds=golds)
+
+    def doc_to_target(self, doc):
+        return trueneitherfalse(doc['label'])
+
+    def construct_requests(self, doc, ctx):
+        ll_true, _ = rf.loglikelihood(ctx, ' true')
+        ll_neither, _ = rf.loglikelihood(ctx, ' neither')
+        ll_false, _ = rf.loglikelihood(ctx, ' false')
+        return ll_true, ll_neither, ll_false
+
+    def process_results(self, doc, results):
+        gold = doc["label"]
+        acc = 1. if np.argmax(results) == gold else 0.
+        return {
+            "acc": acc
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": mean
+        }


 class Copa(HFTask):
     DATASET_PATH = "super_glue"
...
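
Taken together, the refactor splits evaluation into two phases: construct_requests emits one loglikelihood request per candidate label, and process_results later receives the scored loglikelihoods in the same (true, neither, false) order, so np.argmax lines up directly with the dataset's 0/1/2 label encoding. A minimal sketch of that contract, with made-up scores (values hypothetical, not from this commit):

    import numpy as np

    # Suppose the LM scored the three continuations for a doc whose gold label is 1 ("neither").
    results = [-4.2, -1.3, -5.0]   # (ll_true, ll_neither, ll_false)
    doc = {"label": 1}

    acc = 1. if np.argmax(results) == doc["label"] else 0.
    print(acc)   # 1.0 -- tuple order matches the label encoding

Per-document "acc" values are then reduced with the mean aggregator declared in aggregation(), and higher_is_better() marks accuracy as a metric to maximize.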