Commit f6afabd5 authored by guijinSON
Browse files

add csatqa

parent 11e650db
No preview for this file type
from lm_eval.base import MultipleChoiceTask
import os
import datasets class CSATQA(MultipleChoiceTask):
import json VERSION = 0
DATASET_PATH = "EleutherAI/csatqa"
def has_training_docs(self):
return False
_CITATION = """\ def has_validation_docs(self):
""" return False
_DESCRIPTION = """\ def has_test_docs(self):
CSAT-QA return True
"""
_HOMEPAGE = "https://huggingface.co/HAERAE-HUB" def test_docs(self):
return map(self._process_doc, self.dataset["test"])
_LICENSE = "Proprietary" def _process_doc(self, doc):
choices = [doc["option#1"], doc["option#2"], doc["option#3"], doc["option#4"], doc["option#5"]]
out_doc = {
"question": doc["question"],
"choices": choices,
"gold": int(doc['gold']),
}
return out_doc
split_names = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"] def doc_to_text(self, doc):
return doc["question"]
class CSATQAConfig(datasets.BuilderConfig):
    """Builder configuration used by the CSAT-QA dataset script.

    The configuration version is always set to 1.0.0; every other keyword
    argument is passed through to ``datasets.BuilderConfig`` untouched.
    """

    def __init__(self, **kwargs):
        """Create the config with its version fixed at 1.0.0.

        Args:
            **kwargs: Keyword arguments forwarded to
                ``datasets.BuilderConfig.__init__`` (e.g. ``name``).
        """
        pinned_version = datasets.Version("1.0.0")
        super().__init__(version=pinned_version, **kwargs)
class WR(CSATQA):
    """CSATQA task variant whose ``DATASET_NAME`` is ``"WR"``.

    NOTE(review): presumably this selects the "WR" configuration of the
    dataset at ``DATASET_PATH`` -- confirm against the task base class.
    """

    DATASET_NAME = "WR"
class CSATQA(datasets.GeneratorBasedBuilder): class GR(CSATQA):
BUILDER_CONFIGS = [ DATASET_NAME = "GR"
CSATQAConfig(
name=name,
)
for name in split_names
]
def _info(self): class RCS(CSATQA):
features = datasets.Features( DATASET_NAME = "RCS"
{
"question": datasets.Value("string"), class RCSS(CSATQA):
"option#1": datasets.Value("string"), DATASET_NAME = "RCSS"
"option#2": datasets.Value("string"),
"option#3": datasets.Value("string"),
"option#4": datasets.Value("string"),
"option#5": datasets.Value("string"),
"gold": datasets.Value("int8"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager): class RCH(CSATQA):
data_dir = "HAERAE-HUB/CSAT-QA" DATASET_NAME = "RCH"
return [
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(data_dir, "data", "data.jsonl"),
},
),
]
def _generate_examples(self, filepath): class LI(CSATQA):
with open(filepath, encoding="utf-8") as f: DATASET_NAME = "LI"
for key, row in enumerate(f):
data = json.loads(row)
if data["split"] == self.config.name:
data["gold"] = int(data["gold"]) - 1
data.pop("split")
yield key, data
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment