# gaokaobench.py

import json
import os
import re
from copy import deepcopy
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

# Subjects whose per-sample `answer` list is joined into a single string by
# `GaoKaoBenchDataset.load`. For these subjects `all_classes` is never derived
# from the question text, so it keeps its default value.
multi_choice_datasets = [
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "English Fill in Blanks",
    "English Reading Comp",
    "Geography MCQs",
    "Physics MCQs",
    "English Cloze Test",
]

# Subjects for which `GaoKaoBenchDataset.load` sets the inference language to "Chinese".
chinese_qa_datasets = [
    "Biology MCQs",
    "Chemistry MCQs",
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "Geography MCQs",
    "History MCQs",
    "Math I MCQs",
    "Math II MCQs",
    "Physics MCQs",
    "Political Science MCQs",
]
# Subjects for which `GaoKaoBenchDataset.load` sets the inference language to "English".
english_qa_datasets = ["English MCQs", "English Fill in Blanks", "English Reading Comp", "English Cloze Test"]

# Baseline inference arguments. `GaoKaoBenchDataset.load` deep-copies this dict
# once per subject and overrides `all_classes` and `language` where needed.
# NOTE(review): the remaining keys are not read anywhere in this file --
# presumably they are consumed by the inference pipeline; confirm against the caller.
default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_all_classes(instruction: str):
    """
    Extract the answer-option letters that appear in a question.

    An option marker is an uppercase ASCII letter directly followed by "." or
    by the full-width "．". Only the unbroken run of letters starting at "A"
    is kept, so markers found as "A", "B", "D" produce ["A", "B"], and a
    question without an "A" marker produces [].

    Args:
        instruction: Question text that contains the option markers.

    Returns:
        The option letters in alphabetical order, as a list of one-character strings.
    """
    pattern = r"([A-Z]\. |[A-Z]．|[A-Z]\.)"
    # Only the leading letter of each match matters; a set removes duplicates.
    seen_letters = {marker[0] for marker in re.findall(pattern, instruction)}

    classes = []
    for expected in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        # Stop at the first gap in the alphabet: later letters are not options.
        if expected not in seen_letters:
            break
        classes.append(expected)
    return classes


class GaoKaoBenchDataset(BaseDataset):
    """
    Dataset class for GAOKAO-Bench dataset.
    Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
    This dataset class will convert the original dataset into the inference dataset.

    A few typos need to be manually corrected in the original dataset; some of the following have already been fixed.
    Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
    1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
    2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
    3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> Dict[str, Dict[str, Dict]]:
        """
        Load every GAOKAO-Bench subject file under ``path/data`` and convert it into inference samples.

        Args:
            path: Directory that contains the ``data`` folder with the three question-category
                sub-folders ("Fill-in-the-blank_Questions", "Multiple-choice_Questions",
                "Open-ended_Questions").
            logger: Not used by this loader.
            few_shot: Not used by this loader.

        Returns:
            A nested dict of the form
            ``{"test": {subject: {"data": [sample, ...], "inference_kwargs": {...}}}}``.
            Every sample is a dict with the keys ``dataset``, ``split``, ``category``,
            ``instruction``, ``input``, ``output`` and ``target``.
        """
        dataset = {"test": {}}
        for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
            category_dir = os.path.join(path, "data", category)
            # os.listdir returns entries in arbitrary order; sort so subjects load deterministically.
            files: List[str] = sorted(os.listdir(category_dir))

            for file in files:
                # Skip stray entries such as ".DS_Store": both the name slicing below and
                # json.load assume a "*.json" data file.
                if not file.lower().endswith(".json"):
                    continue

                # File names look like "2010-2022_Chemistry_MCQs.json": drop the 10-character
                # year prefix and the ".json" suffix, then turn underscores into spaces.
                subject = " ".join(file[10:-5].split("_"))

                # Parse first and release the file handle before converting the samples.
                with open(os.path.join(category_dir, file), encoding="utf-8") as f:
                    examples = json.load(f)["example"]

                # It's been tested that all data samples in one subcategory share the same inference arguments,
                # so the option letters are derived from the first question only.
                inference_kwargs = deepcopy(default_inference_kwargs)
                if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets and examples:
                    inference_kwargs["all_classes"] = get_all_classes(examples[0]["question"])
                if subject in english_qa_datasets:
                    inference_kwargs["language"] = "English"
                if subject in chinese_qa_datasets:
                    inference_kwargs["language"] = "Chinese"

                # "Multiple-choice_Questions" -> "Multiple-choice": strip the trailing "_Questions".
                sample_category = f"{category[:-10]}-{subject}"

                samples = []
                for sample in examples:
                    answer = sample["answer"]

                    # Convert multi-choice answers to a single string.
                    # We will convert it back when evaluating.
                    # We do this because if target is a list, it should be only used for multiple target answers.
                    if subject in multi_choice_datasets:
                        answer = "".join(answer)

                    # A one-element answer list is unwrapped to its only element.
                    if isinstance(answer, list) and len(answer) == 1:
                        answer = answer[0]

                    samples.append(
                        {
                            "dataset": "gaokaobench",
                            "split": "test",
                            "category": sample_category,
                            "instruction": sample["question"].strip() + "\n答案：",
                            "input": "",
                            "output": "",
                            "target": answer,
                        }
                    )

                dataset["test"][subject] = {"data": samples, "inference_kwargs": inference_kwargs}

        return dataset