Unverified Commit ff2e7aa1 authored by Jiwung Hyun, committed by GitHub

Add klue-ynat task for Korean language evaluation (#310)



- Add macro_f1_score for multi-class task
- Add klue-ynat task
Co-authored-by: Taekyoon <tgchoi03@gmail.com>
parent 11fa0bf4
@@ -44,6 +44,13 @@ def f1_score(items):
    return np.max(fscore)


def macro_f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds, average='macro')
    return fscore


def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
......
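For context, a metric function in the harness is handed the list of per-document values that a task's process_results returns under that metric key. A minimal sketch of how the new aggregation behaves, using made-up (gold, pred) label pairs purely for illustration:

from lm_eval.metrics import macro_f1_score

# Hypothetical (gold, pred) label pairs, as emitted by a task's process_results()
items = [(0, 0), (1, 2), (2, 2), (5, 5)]

# Unweighted mean of the per-class F1 scores over the labels that occur
print(macro_f1_score(items))  # ~0.67 for this toy input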
@@ -304,6 +304,7 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
    "klue_sts": klue.STS,
    "klue_ynat": klue.YNAT,
    "nsmc": nsmc.NSMC,
    "korquad": korquad.Korquad,
    # Requires manual download of data.
......
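With the registry entry in place, the task is addressable by the name "klue_ynat" (e.g. via the harness's --tasks flag). A small sanity-check sketch, assuming TASK_REGISTRY lives in lm_eval.tasks as in the upstream harness:

from lm_eval.tasks import TASK_REGISTRY

task_cls = TASK_REGISTRY["klue_ynat"]   # -> klue.YNAT
task = task_cls()                       # loads the KLUE "ynat" dataset via the datasets hub
print(task.has_training_docs(), task.has_test_docs())  # True False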
"""
NSMC:
KLUE
https://arxiv.org/abs/2105.09680
The Korean Language Understanding Evaluation (KLUE) benchmark is a series of datasets
to evaluate the natural language understanding capability of Korean language models.
KLUE consists of 8 diverse and representative tasks, which are accessible to anyone without any restrictions.
With ethical considerations in mind, we deliberately design annotation guidelines
to obtain unambiguous annotations for all datasets. Furthermore, we build an evaluation system
and carefully choose evaluation metrics for every task, thus establishing fair comparison across Korean language models.
Homepage: https://klue-benchmark.com/
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno
from lm_eval.base import Task, MultipleChoiceTask, rf
from lm_eval.metrics import macro_f1_score, mean, matthews_corrcoef, f1_score, yesno
from lm_eval.utils import general_detokenize
_CITATION = """
@@ -22,7 +33,7 @@ class STS(Task):
    VERSION = 0
    DATASET_PATH = "klue"
    DATASET_NAME = "sts"

    def has_training_docs(self):
        return True
@@ -62,7 +73,7 @@ class STS(Task):
            "acc": pred == gold,
            "f1": (gold, pred)
        }

    def higher_is_better(self):
        return {
            "acc": True,
@@ -74,4 +85,57 @@ class STS(Task):
            "acc": mean,
            "f1": f1_score
        }
\ No newline at end of file
class YNAT(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "klue"
    DATASET_NAME = "ynat"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
        return self._training_docs

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        out_doc = {
            "title": doc["title"],
            "choices": ["과학", "경제", "사회", "생활", "세계", "스포츠", "정치"],
            "gold": doc["label"]
        }
        return out_doc

    def doc_to_text(self, doc):
        return "{}".format(doc["title"])

    def doc_to_target(self, doc):
        return " ({})".format({0: "과학", 1: "경제", 2: "사회", 3: "생활", 4: "세계", 5: "스포츠", 6: "정치"}[doc["gold"]])

    def process_results(self, doc, results):
        pred = np.argmax(results)
        gold = doc["gold"]
        return {
            "f1": (gold, pred)
        }

    def higher_is_better(self):
        return {
            "f1": True
        }

    def aggregation(self):
        return {
            "f1": macro_f1_score
        }
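To make the prompt format concrete, here is a small illustration of what YNAT's doc_to_text / doc_to_target produce for one record; the inherited MultipleChoiceTask machinery then scores each of the seven category strings as a continuation, and process_results takes the argmax. The headline below is invented for illustration, not taken from the dataset:

# Hypothetical KLUE-YNAT record (title and label are made up).
# Label order mirrors the class above: science/IT, economy, society, life & culture, world, sports, politics.
labels = ["과학", "경제", "사회", "생활", "세계", "스포츠", "정치"]

doc = {"title": "삼성전자, 2분기 반도체 실적 발표", "label": 1}

prompt = "{}".format(doc["title"])             # doc_to_text -> the bare headline
target = " ({})".format(labels[doc["label"]])  # doc_to_target -> " (경제)"
print(prompt + target)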