Commit a538a1ad authored by thefazzer

Add MultiRC Implementation

parent 2c6ff0d5
 import abc
 import random
 import collections
+import numpy as np
 from sklearn.metrics import precision_recall_fscore_support as score
 class LM(abc.ABC):
@@ -187,6 +188,24 @@ def f1_score(items):
     precision, recall, fscore, support = score(golds, preds)
     return max(fscore)
+def acc_all(items):
+    # Only count as correct if all answers are labeled correctly for each question
+    question_scoring_dict = {}
+    preds, docs = zip(*items)
+    for pred, doc in zip(preds, docs):
+        question_id = doc["idx"]["question"]
+        if question_id not in question_scoring_dict:
+            question_scoring_dict[question_id] = []
+        gold_label = doc["label"] == 1
+        question_scoring_dict[question_id].append(gold_label == pred)
+    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+    return acc
 def median(arr):
     return arr[len(arr) // 2]
...
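Note on the new metric: acc_all scores MultiRC per question, counting a question as correct only when every one of its answer candidates is classified correctly. A minimal sketch of how it consumes the (pred, doc) pairs emitted by process_results, assuming the lm_eval.base location shown in the diff above; the doc dicts are made-up stand-ins for real MultiRC examples:

from lm_eval.base import acc_all  # import path per the diff above

# Toy items in the shape process_results emits: one (pred, doc) per answer candidate.
items = [
    # question 5: both candidates classified correctly -> contributes 1
    (True,  {"idx": {"question": 5}, "label": 1}),
    (False, {"idx": {"question": 5}, "label": 0}),
    # question 7: second candidate misclassified -> contributes 0
    (True,  {"idx": {"question": 7}, "label": 1}),
    (True,  {"idx": {"question": 7}, "label": 0}),
]

print(acc_all(items))  # 0.5 -- exactly one of the two questions is fully correct

Grouping on doc["idx"]["question"] is what makes this stricter than plain per-candidate accuracy.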
@@ -3,7 +3,7 @@
 import numpy as np
 from tqdm import auto as tqdm_lib
 from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse, truefalse
-from lm_eval.base import rf, mean, f1_score
+from lm_eval.base import rf, mean, f1_score, acc_all
 class BoolQ(HFTask):
     DATASET_PATH = "super_glue"
@@ -129,7 +129,7 @@ class Copa(HFTask):
         return True
     def fewshot_description(self):
-        return "Given a premise and two alternatives, one of which has a causal relation to the premise and the other does not, choose the more plausible alternative"
+        return "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative"
     def doc_to_text(self, doc, include_target=True):
         # Drop the period
@@ -206,41 +206,36 @@ class MultiRC(HFTask):
         label_str = "True" if label else "False"
         return f"[{label_str}] {answer}"
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Implement evaluation code using new framework
-        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
-        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
-        # Remove this comment when the evaluation code is implemented.
-        preds = []
-        for doc in docs:
-            ctx = self.fewshot_context(
-                doc=doc,
-                provide_description=provide_description,
-                num_fewshot=num_fewshot,
-            )
-            true_choice = self.format_answer(answer=doc["answer"], label=True)
-            false_choice = self.format_answer(answer=doc["answer"], label=False)
-            preds.append(
-                lm.loglikelihood(ctx, f' {true_choice}')
-                > lm.loglikelihood(ctx, f' {false_choice}')
-            )
-        # Only count as correct if all answers are labeled correctly for each question
-        question_scoring_dict = {}
-        for doc, pred in zip(docs, preds):
-            question_id = doc["idx"]["question"]
-            if question_id not in question_scoring_dict:
-                question_scoring_dict[question_id] = []
-            gold_label = doc["label"] == 1
-            question_scoring_dict[question_id].append(gold_label == pred)
-        acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-        return {
-            "major": acc,
-            "minor": {"acc": acc},
-            "higher_is_better": True,
-        }
+    def doc_to_target(self, doc):
+        return truefalse(doc['label'])
+
+    def construct_requests(self, doc, ctx):
+        true_choice = self.format_answer(answer=doc["answer"], label=True)
+        false_choice = self.format_answer(answer=doc["answer"], label=False)
+
+        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
+        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
+
+        return ll_true_choice, ll_false_choice
+
+    def process_results(self, doc, results):
+        # results holds the log-likelihoods of the "[True] ..." and "[False] ..."
+        # continuations; predict True when the true continuation is more likely.
+        ll_true_choice, ll_false_choice = results
+        pred = ll_true_choice > ll_false_choice
+        return {
+            "acc": (pred, doc)
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+
+    def aggregation(self):
+        return {
+            "acc": acc_all
+        }
 class WordsInContext(HFTask):
     DATASET_PATH = "super_glue"
...
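The reason process_results returns the (pred, doc) pair rather than a finished score is that acc_all needs each doc to group candidates by question idx at aggregation time. As a rough sketch of that contract (the evaluator itself is not part of this diff; aggregate, docs, and resolved_lls are hypothetical names), an evaluation loop would collect per-doc results by metric name and hand each list to the matching aggregation function:

from collections import defaultdict

def aggregate(task, docs, resolved_lls):
    # resolved_lls[i] holds the log-likelihood values answering the requests
    # that construct_requests issued for docs[i].
    by_metric = defaultdict(list)
    for doc, lls in zip(docs, resolved_lls):
        for metric, value in task.process_results(doc, lls).items():
            by_metric[metric].append(value)   # e.g. "acc" -> (pred, doc)
    aggs = task.aggregation()                 # {"acc": acc_all} for MultiRC
    return {metric: fn(by_metric[metric]) for metric, fn in aggs.items()}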