Commit 2c6ff0d5 authored by thefazzer's avatar thefazzer
Browse files

Add COPA implementation

parent 5165bd38
...@@ -63,4 +63,11 @@ def trueneitherfalse(x): ...@@ -63,4 +63,11 @@ def trueneitherfalse(x):
elif 1: elif 1:
return 'neither' return 'neither'
elif 2: elif 2:
return 'false' return 'false'
\ No newline at end of file
def truefalse(x):
    """Convert a truthy/falsy task label to the literal string 'true' or 'false'.

    Companion to yesno/trueneitherfalse in this module: maps an integer
    label (0 = negative class) to the word used in the prompt target.

    Fixes the original, which tested the constant ``0`` — always falsy —
    and therefore ignored ``x`` and returned 'true' unconditionally.
    """
    if x:
        return 'true'
    else:
        return 'false'
\ No newline at end of file
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import numpy as np import numpy as np
from tqdm import auto as tqdm_lib from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse, truefalse
from lm_eval.base import rf, mean, f1_score from lm_eval.base import rf, mean, f1_score
class BoolQ(HFTask): class BoolQ(HFTask):
...@@ -128,6 +128,9 @@ class Copa(HFTask): ...@@ -128,6 +128,9 @@ class Copa(HFTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def fewshot_description(self):
    """Return the natural-language task description prepended in few-shot prompts."""
    description = (
        "Given a premise and two alternatives, one of which has a causal "
        "relation to the premise and the other does not, choose the more "
        "plausible alternative"
    )
    return description
def doc_to_text(self, doc, include_target=True): def doc_to_text(self, doc, include_target=True):
# Drop the period # Drop the period
connector = { connector = {
...@@ -141,24 +144,36 @@ class Copa(HFTask): ...@@ -141,24 +144,36 @@ class Copa(HFTask):
text += self.convert_choice(correct_choice) text += self.convert_choice(correct_choice)
return text return text
def doc_to_target(self, doc):
    """Return the target string for a COPA doc.

    Reconstructed from the new side of a diff scrape that interleaved
    this method with the deleted ``evaluate`` implementation.
    # NOTE(review): truefalse comes from .common — presumably maps the
    # integer label to 'true'/'false'; confirm against that module.
    """
    return truefalse(doc['label'])
def construct_requests(self, doc, ctx):
    """Build loglikelihood requests for both COPA alternatives.

    Reconstructed from the new side of a diff scrape (the old
    ``evaluate`` body on these lines was deleted by the commit).

    :param doc: one COPA example with 'choice1' and 'choice2' fields
    :param ctx: the (possibly few-shot) context string
    :return: (ll_choice1, ll_choice2) loglikelihood requests; the
             higher-scoring choice is selected in process_results.
    """
    # Leading space so each choice continues the context as a new token.
    choice1 = " " + self.convert_choice(doc["choice1"])
    choice2 = " " + self.convert_choice(doc["choice2"])

    # rf.loglikelihood yields (loglikelihood, is_greedy); only the
    # loglikelihood is needed for the argmax comparison.
    ll_choice1, _ = rf.loglikelihood(ctx, choice1)
    ll_choice2, _ = rf.loglikelihood(ctx, choice2)

    return ll_choice1, ll_choice2
def process_results(self, doc, results):
    """Score a single doc: 1.0 if the higher-loglikelihood choice matches the gold label, else 0.0."""
    predicted = np.argmax(results)
    gold_label = doc["label"]
    return {"acc": 1. if predicted == gold_label else 0.}
def higher_is_better(self):
    """Accuracy is a higher-is-better metric."""
    return dict(acc=True)
def aggregation(self):
    """Map each reported metric to its aggregation function (accuracy: mean over docs)."""
    return dict(acc=mean)
@staticmethod @staticmethod
def convert_choice(choice): def convert_choice(choice):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment