# kosbi.py


# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Korean Social Bias (KoSBi) Dataset"""

import json
import datasets


# BibTeX entry for the KoSBi paper; handed to `DatasetInfo(citation=...)` in `_info`.
_CITATION = """\
@inproceedings{lee2023kosbi,
                title={KoSBi: A Dataset for Mitigating Social Bias Risks Towards Safer Large Language Model Application},
                author={Hwaran Lee and Seokhee Hong and Joonsuk Park and Takyoung Kim and Gunhee Kim and Jung-Woo Ha},
                booktitle={Proceedings of the 61th Annual Meeting of the Association for Computational Linguistics: Industry Track},
                year={2023}
}
"""

# Short human-readable summary; handed to `DatasetInfo(description=...)` in `_info`.
_DESCRIPTION = """\
This is a korean social bias dataset.
The total number of (context, sentence) pairs has increased to almost 68k, with 34.2k safe sentences and 33.8k unsafe sentences.
"""

# Project page of the benchmark repository; handed to `DatasetInfo(homepage=...)`.
_HOMEPAGE = "https://github.com/naver-ai/korean-safety-benchmarks/"

# License name string; handed to `DatasetInfo(license=...)`.
_LICENSE = "MIT License"

# Base location of the raw KoSBi JSON files; every split file lives directly under it.
_URL = "https://raw.githubusercontent.com/naver-ai/korean-safety-benchmarks/main/data/KoSBi/"
# Download URL per split key. The file names differ only in the split suffix,
# so the mapping is derived from the keys (insertion order: train, valid, test).
_URLs = {part: f"{_URL}kosbi_v2_{part}.json" for part in ("train", "valid", "test")}


# TODO: The name of the dataset usually matches the script name, in CamelCase instead of snake_case
class KoSBi(datasets.GeneratorBasedBuilder):
    """Dataset builder for KoSBi, the Korean Social Bias Dataset.

    Downloads the train/valid/test JSON files listed in ``_URLs`` and yields
    one example per record with the fields ``context``, ``sentence``,
    ``context_label`` and ``sentence_label``.
    """

    VERSION = datasets.Version("1.1.0")

    def _info(self):
        """Return the ``DatasetInfo`` describing the features and metadata."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "context": datasets.Value("string"),
                    "sentence": datasets.Value("string"),
                    # The order of `names` fixes the integer id of each class;
                    # do not reorder.
                    "context_label": datasets.ClassLabel(names=["unsafe", "undecided", "safe"]),
                    "sentence_label": datasets.ClassLabel(names=["unsafe", "safe"]),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download every split file and return one ``SplitGenerator`` per split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` is called with the ``_URLs`` mapping
                and the result is indexed by the same keys.

        Returns:
            A list with the train, validation and test split generators, in
            that order.
        """
        downloaded_files = dl_manager.download_and_extract(_URLs)
        # (split name for `datasets`, key in `_URLs`, value passed on as `split`)
        split_specs = (
            (datasets.Split.TRAIN, "train", "train"),
            (datasets.Split.VALIDATION, "valid", "validation"),
            (datasets.Split.TEST, "test", "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": downloaded_files[url_key],
                    "split": split,
                },
            )
            for split_name, url_key, split in split_specs
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(index, example)`` pairs read from one split's JSON file.

        Args:
            filepath: Path of the downloaded JSON file for this split.
            split: Name of the split being generated. Unused here; accepted
                because ``_split_generators`` passes it in ``gen_kwargs``.

        Yields:
            Tuples of the running record index and a dict holding the
            ``context``, ``sentence``, ``context_label`` and
            ``sentence_label`` values of that record.
        """
        # Decode explicitly as UTF-8: the data is Korean text and the platform
        # default encoding (e.g. on Windows) is not guaranteed to be UTF-8.
        with open(filepath, encoding="utf-8") as f:
            # Presumably a top-level JSON array of objects, given how each row
            # is indexed below -- confirm against the upstream files.
            data = json.load(f)

        # The file is fully parsed and already closed, so no handle stays open
        # while the generator is suspended between yields.
        for id_, row in enumerate(data):
            yield id_, {
                "context": row["context"],
                "sentence": row["sentence"],
                "context_label": row["context_label"],
                "sentence_label": row["sentence_label"],
            }