Unverified Commit ff2e7aa1 authored by Jiwung Hyun, committed by GitHub

Add klue-ynat task for Korean language evaluation (#310)



- Add macro_f1_score for multi-class task
- Add klue-ynat task
Co-authored-by: Taekyoon <tgchoi03@gmail.com>
parent 11fa0bf4
@@ -44,6 +44,13 @@ def f1_score(items):
    return np.max(fscore)


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds, average='macro')
    return fscore


def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
......
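For context, a metric function in the harness is handed the list of per-document values that a task's process_results returns under that metric key. A minimal sketch of how the new aggregation behaves, using made-up (gold, pred) label pairs purely for illustration:

from lm_eval.metrics import macro_f1_score

# Hypothetical (gold, pred) label pairs, as emitted by a task's process_results()
items = [(0, 0), (1, 2), (2, 2), (5, 5)]

# Unweighted mean of the per-class F1 scores over the labels that occur
print(macro_f1_score(items))  # ~0.67 for this toy input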
@@ -304,6 +304,7 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
    "klue_sts": klue.STS,
    "klue_ynat": klue.YNAT,
    "nsmc": nsmc.NSMC,
    "korquad": korquad.Korquad,
    # Requires manual download of data.
......
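With the registry entry in place, the task is addressable by the name "klue_ynat" (e.g. via the harness's --tasks flag). A small sanity-check sketch, assuming TASK_REGISTRY lives in lm_eval.tasks as in the upstream harness:

from lm_eval.tasks import TASK_REGISTRY

task_cls = TASK_REGISTRY["klue_ynat"]   # -> klue.YNAT
task = task_cls()                       # loads the KLUE "ynat" dataset via the datasets hub
print(task.has_training_docs(), task.has_test_docs())  # True False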
"""
NSMC:
KLUE
https://arxiv.org/abs/2105.09680
The Korean Language Understanding Evaluation (KLUE) benchmark is a series of datasets
to evaluate the natural language understanding capability of Korean language models.
KLUE consists of 8 diverse and representative tasks, which are accessible to anyone without any restrictions.
With ethical considerations in mind, we deliberately design annotation guidelines
to obtain unambiguous annotations for all datasets. Furthermore, we build an evaluation system
and carefully choose evaluation metrics for every task, thus establishing fair comparison across Korean language models.
Homepage: https://klue-benchmark.com/
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno
from lm_eval.base import Task, MultipleChoiceTask, rf
from lm_eval.metrics import macro_f1_score, mean, matthews_corrcoef, f1_score, yesno
from lm_eval.utils import general_detokenize
_CITATION = """
@@ -22,7 +33,7 @@ class STS(Task):
    VERSION = 0
    DATASET_PATH = "klue"
    DATASET_NAME = "sts"

    def has_training_docs(self):
        return True
@@ -62,7 +73,7 @@ class STS(Task):
            "acc": pred == gold,
            "f1": (gold, pred)
        }

    def higher_is_better(self):
        return {
            "acc": True,
@@ -74,4 +85,57 @@ class STS(Task):
            "acc": mean,
            "f1": f1_score
        }
\ No newline at end of file
class YNAT(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "klue"
    DATASET_NAME = "ynat"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
        return self._training_docs

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        out_doc = {
            "title": doc["title"],
            "choices": ["과학", "경제", "사회", "생활", "세계", "스포츠", "정치"],
            "gold": doc["label"]
        }
        return out_doc

    def doc_to_text(self, doc):
        return "{}".format(doc["title"])

    def doc_to_target(self, doc):
        return " ({})".format({0: "과학", 1: "경제", 2: "사회", 3: "생활", 4: "세계", 5: "스포츠", 6: "정치"}[doc["gold"]])

    def process_results(self, doc, results):
        pred = np.argmax(results)
        gold = doc["gold"]
        return {
            "f1": (gold, pred)
        }

    def higher_is_better(self):
        return {
            "f1": True
        }

    def aggregation(self):
        return {
            "f1": macro_f1_score
        }
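To make the prompt format concrete, here is a small illustration of what YNAT's doc_to_text / doc_to_target produce for one record; the inherited MultipleChoiceTask machinery then scores each of the seven category strings as a continuation, and process_results takes the argmax. The headline below is invented for illustration, not taken from the dataset:

# Hypothetical KLUE-YNAT record (title and label are made up).
# Label order mirrors the class above: science/IT, economy, society, life & culture, world, sports, politics.
labels = ["과학", "경제", "사회", "생활", "세계", "스포츠", "정치"]

doc = {"title": "삼성전자, 2분기 반도체 실적 발표", "label": 1}

prompt = "{}".format(doc["title"])             # doc_to_text -> the bare headline
target = " ({})".format(labels[doc["label"]])  # doc_to_target -> " (경제)"
print(prompt + target)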