Commit 5165bd38 authored by thefazzer's avatar thefazzer
Browse files

Added F1_score metric

parent 68a8790c
import abc
import random
import collections
from sklearn.metrics import precision_recall_fscore_support as score
class LM(abc.ABC):
@abc.abstractmethod
......@@ -180,6 +180,13 @@ class Dataset(abc.ABC):
def mean(arr):
    """Return the arithmetic mean of *arr* (raises ZeroDivisionError if empty)."""
    total = 0
    for value in arr:
        total += value
    return total / len(arr)
def f1_score(items):
    """Aggregate accumulated (gold, pred) pairs into a single F1 value.

    *items* is an iterable of 2-tuples as emitted per example by a task's
    process_results. The pairs are split into parallel gold/pred sequences
    and fed to sklearn's precision_recall_fscore_support.

    NOTE(review): this returns the MAX of the per-class F1 scores, not the
    macro average — confirm that is the intended aggregation.
    """
    golds, preds = zip(*items)
    # score == sklearn.metrics.precision_recall_fscore_support (file-level import);
    # it returns per-class (precision, recall, fscore, support) arrays.
    _, _, fscore, _ = score(golds, preds)
    return max(fscore)
def median(arr):
    """Return the median of *arr*.

    Fix: the original indexed the raw sequence (``arr[len(arr) // 2]``),
    which is only the median if the caller pre-sorts — aggregation lists in
    this file arrive in arbitrary example order, so we sort first.

    For even-length input this keeps the original's upper-middle convention
    (index ``len // 2``) rather than averaging the two middle values, so the
    return type of an int list stays int.

    Raises IndexError on an empty sequence.
    """
    return sorted(arr)[len(arr) // 2]
......
......@@ -3,7 +3,7 @@
import numpy as np
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno, trueneitherfalse
from lm_eval.base import rf, mean
from lm_eval.base import rf, mean, f1_score
class BoolQ(HFTask):
DATASET_PATH = "super_glue"
......@@ -96,10 +96,12 @@ class CommitmentBank(HFTask):
def process_results(self, doc, results):
    """Score one CommitmentBank example.

    Fix: the captured diff interleaved the pre- and post-change lines
    (duplicate ``acc`` assignment, conflicting dict entries — a syntax
    error); this is the resolved post-commit version.

    Args:
        doc: dataset example; only ``doc["label"]`` (gold class index) is read.
        results: per-class scores; the argmax index is the prediction.

    Returns:
        dict with "acc" (1.0/0.0 exact match) and "f1" (the raw
        ``(pred, gold)`` pair, aggregated corpus-level by ``f1_score``).
    """
    gold = doc["label"]
    # Predicted class = index of the highest score.
    pred = np.argmax(results)
    acc = 1. if pred == gold else 0.
    return {
        "acc": acc,
        # F1 cannot be averaged per-example, so emit the pair for
        # corpus-level aggregation.
        "f1": (pred, gold),
    }
def higher_is_better(self):
......@@ -109,7 +111,8 @@ class CommitmentBank(HFTask):
def aggregation(self):
    """Map each metric name to its corpus-level aggregation function.

    Fix: the captured diff interleaved the pre- and post-change lines
    (duplicate ``"acc"`` entries with a missing comma — a syntax error);
    this is the resolved post-commit version.
    """
    return {
        # acc: mean of per-example 0/1 scores.
        "acc": mean,
        # f1: computed from the accumulated (pred, gold) pairs.
        "f1": f1_score,
    }
class Copa(HFTask):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment