Commit a538a1ad authored by thefazzer

Add MultiRC Implementation

parent 2c6ff0d5
 import abc
 import random
 import collections
+import numpy as np
 from sklearn.metrics import precision_recall_fscore_support as score

 class LM(abc.ABC):

@@ -187,6 +188,24 @@ def f1_score(items):
     precision, recall, fscore, support = score(golds, preds)
     return max(fscore)
+
+def acc_all(items):
+    # Only count as correct if all answers are labeled correctly for each question
+    question_scoring_dict = {}
+    preds = list(zip(*items))[0]
+    docs = list(zip(*items))[1]
+    # Group per-answer correctness by question id, covering every prediction.
+    for pred, doc in zip(preds, docs):
+        question_id = doc["idx"]["question"]
+        if question_id not in question_scoring_dict:
+            question_scoring_dict[question_id] = []
+        gold_label = doc["label"] == 1
+        question_scoring_dict[question_id].append(gold_label == pred)
+    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+    return acc

 def median(arr):
     return arr[len(arr) // 2]

...
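The new acc_all metric only credits a question when every one of its candidate answers is classified correctly, so per-answer accuracy is grouped by question id before averaging. A minimal usage sketch, assuming this branch's lm_eval.base is importable; the toy docs below are invented stand-ins for real MultiRC documents, not part of the commit:

from lm_eval.base import acc_all

# Each item is a (pred, doc) pair, the shape MultiRC.process_results emits under "acc".
# Doc fields follow the HF super_glue/multirc schema: label 1 means the candidate
# answer is true, and idx["question"] groups answers belonging to the same question.
items = [
    (True,  {"idx": {"question": 0}, "label": 1}),   # question 0: answer judged correctly
    (False, {"idx": {"question": 0}, "label": 1}),   # question 0: missed, whole question fails
    (True,  {"idx": {"question": 1}, "label": 1}),   # question 1: its only answer is correct
]

print(acc_all(items))  # 0.5 -- one of the two questions is fully correct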
@@ -3,7 +3,7 @@
 import numpy as np
 from tqdm import auto as tqdm_lib
 from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse, truefalse
-from lm_eval.base import rf, mean, f1_score
+from lm_eval.base import rf, mean, f1_score, acc_all

 class BoolQ(HFTask):
     DATASET_PATH = "super_glue"

@@ -129,7 +129,7 @@ class Copa(HFTask):
         return True

     def fewshot_description(self):
-        return "Given a premise and two alternatives, one of which has a causal relation to the premise and the other does not, choose the more plausible alternative"
+        return "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative"

     def doc_to_text(self, doc, include_target=True):
         # Drop the period

@@ -148,8 +148,8 @@ class Copa(HFTask):
         return truefalse(doc['label'])

     def construct_requests(self, doc, ctx):
         choice1 = " " + self.convert_choice(doc["choice1"])
         choice2 = " " + self.convert_choice(doc["choice2"])
         ll_choice1, _ = rf.loglikelihood(ctx, choice1)
         ll_choice2, _ = rf.loglikelihood(ctx, choice2)

@@ -205,42 +205,37 @@ class MultiRC(HFTask):
     def format_answer(answer, label):
         label_str = "True" if label else "False"
         return f"[{label_str}] {answer}"

-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        preds = []
-        for doc in docs:
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            true_choice = self.format_answer(answer=doc["answer"], label=True)
-            false_choice = self.format_answer(answer=doc["answer"], label=False)
-            preds.append(
-                lm.loglikelihood(ctx, f' {true_choice}')
-                > lm.loglikelihood(ctx, f' {false_choice}')
-            )
-
-        # Only count as correct if all answers are labeled correctly for each question
-        question_scoring_dict = {}
-        for doc, pred in zip(docs, preds):
-            question_id = doc["idx"]["question"]
-            if question_id not in question_scoring_dict:
-                question_scoring_dict[question_id] = []
-            gold_label = doc["label"] == 1
-            question_scoring_dict[question_id].append(gold_label == pred)
-        acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-        return {
-            "major": acc,
-            "minor": {"acc": acc},
-            "higher_is_better": True,
-        }
+    def doc_to_target(self, doc):
+        return truefalse(doc['label'])
+
+    def construct_requests(self, doc, ctx):
+        true_choice = self.format_answer(answer=doc["answer"], label=True)
+        false_choice = self.format_answer(answer=doc["answer"], label=False)
+
+        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
+        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
+
+        return ll_true_choice, ll_false_choice
+
+    def process_results(self, doc, results):
+        ll_true_choice, ll_false_choice = results
+        # Predict "True" when the [True] continuation is the more likely one;
+        # acc_all then compares this boolean against doc["label"] == 1.
+        pred = ll_true_choice > ll_false_choice
+        return {
+            "acc": (pred, doc)
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": acc_all
+        }

 class WordsInContext(HFTask):
     DATASET_PATH = "super_glue"
...
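End to end, the new-framework flow for MultiRC is: construct_requests declares two loglikelihood requests per document ("[True] answer" vs. "[False] answer"), the evaluator scores them with the LM, process_results turns the pair of scores into a (pred, doc) item, and acc_all aggregates those items per question. A rough sketch of that flow, with hand-written scores standing in for real model output; it assumes this file lives at lm_eval/tasks/superglue.py and that constructing the task loads the HF "super_glue" data, neither of which is shown in this diff:

from lm_eval.base import acc_all
from lm_eval.tasks.superglue import MultiRC  # assumed module path for this file

task = MultiRC()  # assumed to load the SuperGLUE MultiRC splits via HF datasets

items = []
for doc in list(task.validation_docs())[:8]:
    # In the harness proper, ctx = task.fewshot_context(...) and the two
    # rf.loglikelihood requests from task.construct_requests(doc, ctx) are
    # executed by the LM; two placeholder scores stand in for that step here.
    ll_true, ll_false = -1.2, -3.4   # "[True] <answer>" judged more likely
    items.append(task.process_results(doc, (ll_true, ll_false))["acc"])

print("MultiRC accuracy over the sampled docs:", acc_all(items))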