Commit f6afabd5 authored by guijinSON
Browse files

add csatqa

parent 11e650db
No preview for this file type
from lm_eval.base import MultipleChoiceTask
import os
import datasets class CSATQA(MultipleChoiceTask):
import json VERSION = 0
DATASET_PATH = "EleutherAI/csatqa"
_CITATION = """\ def has_training_docs(self):
""" return False
_DESCRIPTION = """\ def has_validation_docs(self):
CSAT-QA return False
"""
def has_test_docs(self):
_HOMEPAGE = "https://huggingface.co/HAERAE-HUB" return True
_LICENSE = "Proprietary" def test_docs(self):
return map(self._process_doc, self.dataset["test"])
split_names = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"]
def _process_doc(self, doc):
class CSATQAConfig(datasets.BuilderConfig): choices = [doc["option#1"], doc["option#2"], doc["option#3"], doc["option#4"], doc["option#5"]]
def __init__(self, **kwargs): out_doc = {
super().__init__(version=datasets.Version("1.0.0"), **kwargs) "question": doc["question"],
"choices": choices,
"gold": int(doc['gold']),
class CSATQA(datasets.GeneratorBasedBuilder): }
BUILDER_CONFIGS = [ return out_doc
CSATQAConfig(
name=name, def doc_to_text(self, doc):
) return doc["question"]
for name in split_names
]
class WR(CSATQA):
def _info(self): DATASET_NAME = "WR"
features = datasets.Features(
{ class GR(CSATQA):
"question": datasets.Value("string"), DATASET_NAME = "GR"
"option#1": datasets.Value("string"),
"option#2": datasets.Value("string"), class RCS(CSATQA):
"option#3": datasets.Value("string"), DATASET_NAME = "RCS"
"option#4": datasets.Value("string"),
"option#5": datasets.Value("string"), class RCSS(CSATQA):
"gold": datasets.Value("int8"), DATASET_NAME = "RCSS"
}
) class RCH(CSATQA):
return datasets.DatasetInfo( DATASET_NAME = "RCH"
description=_DESCRIPTION,
features=features, class LI(CSATQA):
homepage=_HOMEPAGE, DATASET_NAME = "LI"
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
    """Declare the single TEST split produced by this builder.

    ``dl_manager`` is accepted but not used here: the JSONL location is
    assembled from a hard-coded relative directory.

    Returns:
        A one-element list holding the ``datasets.SplitGenerator`` for the
        TEST split, whose ``gen_kwargs`` carry the ``filepath`` to read.
    """
    jsonl_path = os.path.join("HAERAE-HUB/CSAT-QA", "data", "data.jsonl")
    test_split = datasets.SplitGenerator(
        name=datasets.Split.TEST,
        gen_kwargs={"filepath": jsonl_path},
    )
    return [test_split]
def _generate_examples(self, filepath):
    """Yield ``(key, example)`` pairs for the rows matching this config.

    Args:
        filepath: Path to a UTF-8 JSON Lines file (one JSON object per line).

    Yields:
        ``(key, data)`` where ``key`` is the zero-based index of the line in
        the file and ``data`` is the decoded row with its ``"split"`` field
        removed and ``"gold"`` converted to ``int`` and decremented by one.
        Only rows whose ``"split"`` equals ``self.config.name`` are yielded.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a row lacks the ``"split"`` or ``"gold"`` field.
    """
    with open(filepath, encoding="utf-8") as f:
        for key, row in enumerate(f):
            # Blank / whitespace-only lines are not valid JSON; skip them
            # instead of crashing. Keys still reflect the physical line index.
            if not row.strip():
                continue
            data = json.loads(row)
            if data["split"] != self.config.name:
                continue
            # Shift the stored answer index down by one
            # (presumably 1-based in the file -> 0-based; confirm with data).
            data["gold"] = int(data["gold"]) - 1
            data.pop("split")
            yield key, data
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment