Add kobest benchmark tasks to eval Korean language task (#329)

* feat: add kobest task - kobest-boolq, kobest-wic, kobest-sentineg, kobest-copa, kobest-hellaswag Co-authored-by: Taekyoon <tgchoi03@gmail.com>

Add kobest benchmark tasks to eval Korean language task (#329)
* feat: add kobest task - kobest-boolq, kobest-wic, kobest-sentineg, kobest-copa, kobest-hellaswag Co-authored-by: Taekyoon <tgchoi03@gmail.com>
7ce584bd · Jiwung Hyun · GitHub · ff2e7aa1 · 7ce584bd · 7ce584bd
Unverified Commit 7ce584bd authored Jun 05, 2022 by Jiwung Hyun Committed by GitHub Jun 05, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 319 additions and 7 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +11 -7

lm_eval/tasks/kobest.py lm_eval/tasks/kobest.py +308 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -51,13 +51,11 @@ from . import blimp
 from . import asdiv
 from . import gsm8k
 from . import storycloze
+from . import kobest
 from . import nsmc
 from . import klue
 from . import korquad
 ########################################
 # Translation tasks
 ########################################
@@ -303,14 +301,20 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
-    "klue_sts": klue.STS,
-    "klue_ynat": klue.YNAT
-    "nsmc": nsmc.NSMC,    
-    "korquad": korquad.Korquad
    # Requires manual download of data.
    # "storycloze_2016": storycloze.StoryCloze2016,
    # "storycloze_2018": storycloze.StoryCloze2018,
    # "sat": sat.SATAnalogies,
+    # Korean-language tasks. Every entry ends with a comma: without one, two
+    # adjacent "key": value pairs do not parse and the module fails to import.
+    "klue_sts": klue.STS,
+    "klue_ynat": klue.YNAT,
+    "nsmc": nsmc.NSMC,
+    "korquad": korquad.Korquad,
+    "kobest_boolq": kobest.BoolQ,
+    "kobest_copa": kobest.COPA,
+    "kobest_wic": kobest.WiC,
+    "kobest_hellaswag": kobest.HellaSwag,
+    "kobest_sentineg": kobest.SentiNeg,
 }

--- a/lm_eval/tasks/kobest.py
+++ b/lm_eval/tasks/kobest.py
+"""
+KOBEST
+https://arxiv.org/abs/2204.04541
+A well-formulated benchmark plays a critical role in spurring advancements 
+in the natural language processing (NLP) field, as it allows objective and
+precise evaluation of diverse models. As modern language models (LMs) have 
+become more elaborate and sophisticated, more difficult benchmarks that require
+linguistic knowledge and reasoning have been proposed. However, most of these
+benchmarks only support English, and great effort is necessary to construct
+benchmarks for other low resource languages. To this end, we propose a new
+benchmark named Korean balanced evaluation of significant tasks (KoBEST),
+which consists of five Korean-language downstream tasks. Professional Korean
+linguists designed the tasks that require advanced Korean linguistic knowledge.
+Moreover, our data is purely annotated by humans and thoroughly reviewed to
+guarantee high data quality. We also provide baseline models and human performance
+results. Our dataset is available on the Huggingface.
+Homepage: https://huggingface.co/datasets/skt/kobest_v1
+"""
+import numpy as np
+from lm_eval.base import MultipleChoiceTask, rf, Task
+from lm_eval.metrics import f1_score, macro_f1_score
+class BoolQ(Task):
+    """KoBEST BoolQ: answer a yes/no question about a paragraph, scored with F1."""
+    VERSION = 0
+    DATASET_PATH = "skt/kobest_v1"
+    DATASET_NAME = "boolq"
+    def has_training_docs(self):
+        """Report that a "train" split is used."""
+        return True
+    def has_validation_docs(self):
+        """Report that a "validation" split is used."""
+        return True
+    def has_test_docs(self):
+        """Report that a "test" split is used."""
+        return True
+    def training_docs(self):
+        """Return the "train" split as a list, materialised once and cached."""
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+    def validation_docs(self):
+        """Return the "validation" split of the dataset."""
+        return self.dataset["validation"]
+    def test_docs(self):
+        """Return the "test" split of the dataset."""
+        return self.dataset["test"]
+    def doc_to_text(self, doc):
+        """Build the prompt from the doc's "paragraph" and "question" fields."""
+        # NOTE(review): the prompt ends with a space and every continuation
+        # starts with one, so the joined text has two spaces - confirm intended.
+        return "{} 질문: {} 답변: ".format(doc["paragraph"], doc["question"])
+    def doc_to_target(self, doc):
+        """Return the gold answer string for ``doc`` (label 0 -> no, 1 -> yes)."""
+        # Index the mapping by the label. Formatting the dict itself made every
+        # target the dict's repr instead of the answer. The strings match the
+        # continuations scored in construct_requests.
+        return " {}".format({0: "아니오.", 1: "예."}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        """Request log-likelihoods of the "no" and "yes" answers given ``ctx``.
+
+        Returns the pair ``(ll_no, ll_yes)``; position 0 is "no" and position 1
+        is "yes", the same order as the integer labels used in doc_to_target.
+        """
+        # The second element of each returned pair is not needed here.
+        ll_no, _ = rf.loglikelihood(ctx, " 아니오.")
+        ll_yes, _ = rf.loglikelihood(ctx, " 예.")
+        return ll_no, ll_yes
+    def process_results(self, doc, results):
+        """Pair the gold label with the index of the highest-scoring answer."""
+        pred = np.argmax(results)
+        gold = doc["label"]
+        return {
+            "f1": (gold, pred)
+        }
+    def higher_is_better(self):
+        """Declare that a larger "f1" value is better."""
+        return {
+            "f1": True
+        }
+    def aggregation(self):
+        """Aggregate the per-doc (gold, pred) pairs with ``f1_score``."""
+        return {
+            "f1": f1_score
+        }
+class COPA(Task):
+    """KoBEST COPA: pick which of two alternatives follows a premise, scored with F1."""
+    VERSION = 0
+    DATASET_PATH = "skt/kobest_v1"
+    DATASET_NAME = "copa"
+    def has_training_docs(self):
+        """Report that a "train" split is used."""
+        return True
+    def has_validation_docs(self):
+        """Report that a "validation" split is used."""
+        return True
+    def has_test_docs(self):
+        """Report that a "test" split is used."""
+        return True
+    def training_docs(self):
+        """Return the "train" split as a list, materialised once and cached."""
+        if self._training_docs is not None:
+            return self._training_docs
+        self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+    def validation_docs(self):
+        """Return the "validation" split of the dataset."""
+        return self.dataset["validation"]
+    def test_docs(self):
+        """Return the "test" split of the dataset."""
+        return self.dataset["test"]
+    def doc_to_text(self, doc):
+        """Return the premise followed by a connector chosen from the question type.
+
+        The stripped "question" field selects the connector; any value other
+        than the two keys below raises ``KeyError``. The full prompt has the
+        shape "{premise} {connector} {alternative}", with the alternative
+        supplied as the continuation.
+        """
+        connector_by_question = {
+            "원인": "왜냐하면",
+            "결과": "그래서",
+        }
+        question_type = doc["question"].strip()
+        connector = connector_by_question[question_type]
+        return doc["premise"] + " " + connector
+    def doc_to_target(self, doc):
+        """Return the gold alternative (label 0 -> alternative_1, else alternative_2)."""
+        if doc["label"] == 0:
+            answer = doc["alternative_1"]
+        else:
+            answer = doc["alternative_2"]
+        return " " + answer
+    def construct_requests(self, doc, ctx):
+        """Request log-likelihoods of both alternatives as continuations of ``ctx``."""
+        # The second element of each returned pair is not needed here.
+        first_ll, _ = rf.loglikelihood(ctx, " " + doc["alternative_1"])
+        second_ll, _ = rf.loglikelihood(ctx, " " + doc["alternative_2"])
+        return first_ll, second_ll
+    def process_results(self, doc, results):
+        """Pair the gold label with the index of the highest-scoring alternative."""
+        predicted = np.argmax(results)
+        expected = doc["label"]
+        return {"f1": (expected, predicted)}
+    def higher_is_better(self):
+        """Declare that a larger "f1" value is better."""
+        return {"f1": True}
+    def aggregation(self):
+        """Aggregate the per-doc (gold, pred) pairs with ``f1_score``."""
+        return {"f1": f1_score}
+class WiC(Task):
+    """KoBEST WiC: decide whether a word has the same meaning in two sentences, scored with F1."""
+    VERSION = 0
+    DATASET_PATH = "skt/kobest_v1"
+    DATASET_NAME = "wic"
+    def has_training_docs(self):
+        """Report that a "train" split is used."""
+        return True
+    def has_validation_docs(self):
+        """Report that a "validation" split is used."""
+        return True
+    def has_test_docs(self):
+        """Report that a "test" split is used."""
+        return True
+    def training_docs(self):
+        """Return the "train" split as a list, materialised once and cached."""
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+    def validation_docs(self):
+        """Return the "validation" split of the dataset."""
+        return self.dataset["validation"]
+    def test_docs(self):
+        """Return the "test" split of the dataset."""
+        return self.dataset["test"]
+    def doc_to_text(self, doc):
+        """Build the prompt from the doc's "context_1", "context_2" and "word" fields."""
+        return "문장1: {} 문장2: {} 두 문장에서 {}가 같은 뜻으로 쓰였나?".format(doc["context_1"], doc["context_2"], doc["word"])
+    def doc_to_target(self, doc):
+        """Return the gold answer string for ``doc`` (label 0 -> no, 1 -> yes)."""
+        # Index the mapping by the label. Formatting the dict itself made every
+        # target the dict's repr instead of the answer. The strings match the
+        # continuations scored in construct_requests.
+        return " {}".format({0: "아니오", 1: "예"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        """Request log-likelihoods of the "no" and "yes" answers given ``ctx``.
+
+        Returns the pair ``(ll_no, ll_yes)``; position 0 is "no" and position 1
+        is "yes", the same order as the integer labels used in doc_to_target.
+        """
+        # The second element of each returned pair is not needed here.
+        ll_no, _ = rf.loglikelihood(ctx, " 아니오")
+        ll_yes, _ = rf.loglikelihood(ctx, " 예")
+        return ll_no, ll_yes
+    def process_results(self, doc, results):
+        """Pair the gold label with the index of the highest-scoring answer."""
+        pred = np.argmax(results)
+        gold = doc["label"]
+        return {
+            "f1": (gold, pred)
+        }
+    def higher_is_better(self):
+        """Declare that a larger "f1" value is better."""
+        return {
+            "f1": True
+        }
+    def aggregation(self):
+        """Aggregate the per-doc (gold, pred) pairs with ``f1_score``."""
+        return {
+            "f1": f1_score
+        }
+class HellaSwag(MultipleChoiceTask):
+    """KoBEST HellaSwag: choose one of four endings for a context, scored with macro F1."""
+    # doc_to_target and construct_requests are not defined here; presumably
+    # they come from MultipleChoiceTask - verify against lm_eval.base.
+    VERSION = 0
+    DATASET_PATH = "skt/kobest_v1"
+    DATASET_NAME = "hellaswag"
+    def has_training_docs(self):
+        """Report that a "train" split is used."""
+        return True
+    def has_validation_docs(self):
+        """Report that a "validation" split is used."""
+        return True
+    def has_test_docs(self):
+        """Report that a "test" split is used."""
+        return True
+    def training_docs(self):
+        """Return the processed "train" split as a list, built once and cached."""
+        if self._training_docs is not None:
+            return self._training_docs
+        self._training_docs = [self._process_doc(raw) for raw in self.dataset["train"]]
+        return self._training_docs
+    def validation_docs(self):
+        """Return a lazy iterator of processed "validation" docs."""
+        return map(self._process_doc, self.dataset["validation"])
+    def test_docs(self):
+        """Return a lazy iterator of processed "test" docs."""
+        return map(self._process_doc, self.dataset["test"])
+    def _process_doc(self, doc):
+        """Convert a raw record into a dict with "query", "choices" and "gold" keys."""
+        prompt = "문장: {}".format(doc["context"])
+        # The four endings live in the fields ending_1 .. ending_4.
+        endings = [doc["ending_{}".format(idx)] for idx in range(1, 5)]
+        answer_index = int(doc['label'])
+        return {
+            "query": prompt,
+            "choices": endings,
+            "gold": answer_index,
+        }
+    def doc_to_text(self, doc):
+        """Return the prompt stored under "query" by ``_process_doc``."""
+        return doc["query"]
+    def process_results(self, doc, results):
+        """Pair the gold index with the index of the highest-scoring ending."""
+        predicted = np.argmax(results)
+        expected = doc["gold"]
+        return {"f1": (expected, predicted)}
+    def higher_is_better(self):
+        """Declare that a larger "f1" value is better."""
+        return {"f1": True}
+    def aggregation(self):
+        """Aggregate the per-doc (gold, pred) pairs with ``macro_f1_score``."""
+        return {"f1": macro_f1_score}
+class SentiNeg(Task):
+    """KoBEST SentiNeg: classify a sentence as negative or positive, scored with F1."""
+    VERSION = 0
+    DATASET_PATH = "skt/kobest_v1"
+    DATASET_NAME = "sentineg"
+    def has_training_docs(self):
+        """Report that a "train" split is used."""
+        return True
+    def has_validation_docs(self):
+        """Report that a "validation" split is used."""
+        return True
+    def has_test_docs(self):
+        """Report that a "test" split is used."""
+        return True
+    def training_docs(self):
+        """Return the "train" split as a list, materialised once and cached."""
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+    def validation_docs(self):
+        """Return the "validation" split of the dataset."""
+        return self.dataset["validation"]
+    def test_docs(self):
+        """Return the "test" split of the dataset."""
+        return self.dataset["test"]
+    def doc_to_text(self, doc):
+        """Build the prompt from the doc's "sentence" field."""
+        return "문장: {} 긍부정:".format(doc["sentence"])
+    def doc_to_target(self, doc):
+        """Return the gold polarity string for ``doc`` (label 0 -> negative, 1 -> positive)."""
+        # Index the mapping by the label. Formatting the dict itself made every
+        # target the dict's repr instead of the answer. The strings match the
+        # continuations scored in construct_requests.
+        return " {}".format({0: "부정", 1: "긍정"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        """Request log-likelihoods of the negative and positive answers given ``ctx``.
+
+        Returns the pair ``(ll_no, ll_yes)``; position 0 is the negative answer
+        and position 1 the positive one, the same order as the integer labels
+        used in doc_to_target.
+        """
+        # The second element of each returned pair is not needed here.
+        ll_no, _ = rf.loglikelihood(ctx, " 부정")
+        ll_yes, _ = rf.loglikelihood(ctx, " 긍정")
+        return ll_no, ll_yes
+    def process_results(self, doc, results):
+        """Pair the gold label with the index of the highest-scoring answer."""
+        pred = np.argmax(results)
+        gold = doc["label"]
+        return {
+            "f1": (gold, pred)
+        }
+    def higher_is_better(self):
+        """Declare that a larger "f1" value is better."""
+        return {
+            "f1": True
+        }
+    def aggregation(self):
+        """Aggregate the per-doc (gold, pred) pairs with ``f1_score``."""
+        return {
+            "f1": f1_score
+        }