Commit f60f1753 authored by kabbi159
Browse files

add accuracy and macro f1 score in kobest task

parent 44e4196b
......@@ -21,7 +21,7 @@ Homepage: https://huggingface.co/datasets/skt/kobest_v1
import numpy as np
from lm_eval.base import MultipleChoiceTask, rf, Task
from lm_eval.metrics import f1_score, macro_f1_score
from lm_eval.metrics import f1_score, macro_f1_score, mean
class BoolQ(Task):
......@@ -65,17 +65,23 @@ class BoolQ(Task):
pred = np.argmax(results)
gold = doc["label"]
return {
"f1": (gold, pred)
"f1": (gold, pred),
"acc": pred == gold,
"macro_f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
"f1": True,
"acc": True,
"macro_f1": True
}
def aggregation(self):
return {
"f1": f1_score
"f1": f1_score,
"acc": mean,
"macro_f1": macro_f1_score
}
......@@ -131,17 +137,24 @@ class COPA(Task):
pred = np.argmax(results)
gold = doc["label"]
return {
"f1": (gold, pred)
"f1": (gold, pred),
"acc": pred == gold,
"macro_f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
"f1": True,
"acc": True,
"macro_f1": True
}
def aggregation(self):
return {
"f1": f1_score
"f1": f1_score,
"acc": mean,
"macro_f1": macro_f1_score
}
class WiC(Task):
......@@ -185,17 +198,23 @@ class WiC(Task):
pred = np.argmax(results)
gold = doc["label"]
return {
"f1": (gold, pred)
"f1": (gold, pred),
"acc": pred == gold,
"macro_f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
"f1": True,
"acc": True,
"macro_f1": True
}
def aggregation(self):
return {
"f1": f1_score
"f1": f1_score,
"acc": mean,
"macro_f1": macro_f1_score
}
......@@ -239,17 +258,20 @@ class HellaSwag(MultipleChoiceTask):
pred = np.argmax(results)
gold = doc["gold"]
return {
"f1": (gold, pred)
"acc": pred == gold,
"macro_f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
"acc": True,
"macro_f1": True
}
def aggregation(self):
return {
"f1": macro_f1_score
"acc": mean,
"macro_f1": macro_f1_score
}
......@@ -294,15 +316,21 @@ class SentiNeg(Task):
pred = np.argmax(results)
gold = doc["label"]
return {
"f1": (gold, pred)
"f1": (gold, pred),
"acc": pred == gold,
"macro_f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
"f1": True,
"acc": True,
"macro_f1": True
}
def aggregation(self):
return {
"f1": f1_score
"f1": f1_score,
"acc": mean,
"macro_f1": macro_f1_score
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment