Commit 26f19561 authored by ingyuseong's avatar ingyuseong
Browse files

Add KorUnSmile task

parent cc7650d8
......@@ -56,6 +56,7 @@ from . import nsmc
from . import klue
from . import ko_translation
from . import korquad
from . import korunsmile
########################################
# Translation tasks
......@@ -319,7 +320,8 @@ TASK_REGISTRY = {
"kobest_hellaswag": kobest.HellaSwag,
"kobest_sentineg": kobest.SentiNeg,
"ko_en_translation": ko_translation.KoEnTranslation,
"en_ko_translation": ko_translation.EnKoTranslation
"en_ko_translation": ko_translation.EnKoTranslation,
"korunsmile": korunsmile.KorUnSmile
}
......
"""
Korean UnSmile Dataset
Github: https://github.com/smilegate-ai/korean_unsmile_dataset
"""
import numpy as np
from lm_eval.base import MultipleChoiceTask
from lm_eval.metrics import macro_f1_score
_CITATION = """
@misc{SmilegateAI2022KoreanUnSmileDataset,
title = {Korean UnSmile dataset: Human-annotated Multi-label Korean Hate Speech Dataset},
author = {Seonghyun Kim},
year = {2022},
howpublished = {https://github.com/smilegate-ai/korean_unsmile_dataset},
}
"""
class KorUnSmile(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "smilegate-ai/kor_unsmile"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc,self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc,self.dataset["valid"])
def _process_doc(self, doc):
out_doc = {
"title": doc["문장"],
"choices": ["여성/가족", "남성", "성소수자", "인종/국적", "연령", "지역", "종교", "기타 혐오", "악플/욕설", "clean"],
"gold": np.argmax(doc["labels"])
}
return out_doc
def doc_to_text(self, doc):
return "{}".format(doc["title"])
def doc_to_target(self, doc):
return " {}".format({0: "여성/가족", 1: "남성", 2: "성소수자", 3: "인종/국적", 4: "연령", 5: "지역", 6: "종교", 7: "기타 혐오", 8: "악플/욕설", 9: "clean"}[doc["gold"]])
def process_results(self, doc, results):
pred = np.argmax(results)
gold = doc["gold"]
return {
"f1": (gold, pred)
}
def higher_is_better(self):
return {
"f1": True
}
def aggregation(self):
return {
"f1": macro_f1_score
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment