gaokaobench.py 4.54 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import json
import os
import re
from copy import deepcopy
from typing import Dict, List

from colossalai.logging import DistributedLogger

from .base import BaseDataset

# Subjects whose samples carry several answers at once. The loader joins each
# such answer list into one string and does not derive `all_classes` for them.
multi_choice_datasets = [
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "English Fill in Blanks",
    "English Reading Comp",
    "Geography MCQs",
    "Physics MCQs",
    "English Cloze Test",
]

# Subjects for which the loader sets the inference language to "Chinese".
chinese_qa_datasets = [
    "Biology MCQs",
    "Chemistry MCQs",
    "Chinese Lang and Usage MCQs",
    "Chinese Modern Lit",
    "Geography MCQs",
    "History MCQs",
    "Math I MCQs",
    "Math II MCQs",
    "Physics MCQs",
    "Political Science MCQs",
]

# Subjects for which the loader sets the inference language to "English".
english_qa_datasets = [
    "English MCQs",
    "English Fill in Blanks",
    "English Reading Comp",
    "English Cloze Test",
]

# Template of per-subject inference arguments. The loader deep-copies it for
# every subject and then overrides "all_classes" and "language" where needed.
# NOTE(review): the remaining keys are consumed outside this file; their exact
# semantics should be confirmed against the inference code.
default_inference_kwargs = {
    "calculate_loss": True,
    "all_classes": None,
    "language": "Chinese",
    "pretrain": False,
    "max_new_tokens": 32,
}


def get_all_classes(instruction: str) -> List[str]:
    """
    Extract the answer-option letters offered by a multiple-choice question.

    An option label is an uppercase letter immediately followed by a period,
    either ASCII "." or the full-width form U+FF0E (e.g. "A. ", "B."). Only the
    unbroken run of labels starting at "A" is returned, so a question offering
    A, B and D yields ["A", "B"].

    Args:
        instruction: Question text that embeds the option labels.

    Returns:
        The option letters in alphabetical order, starting from "A". The list is
        empty when the text contains no "A" label.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # The period has to be matched literally. An unescaped "." would accept any
    # character after the capital, so text such as "E=mc2" would be mistaken
    # for an option label "E".
    pattern = r"[A-Z](?=[.\uFF0E])"
    found = set(re.findall(pattern, instruction))

    options = []
    for letter in letters:
        # Stop at the first gap: labels after a missing letter are not trusted.
        if letter not in found:
            break
        options.append(letter)
    return options


class GaoKaoBenchDataset(BaseDataset):
    """
    Dataset class for GAOKAO-Bench dataset.
    Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
    This dataset class will convert the original dataset into the inference dataset.

    A few typos need to be manually corrected in the original dataset; some of the following have been fixed.
    Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
    1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
    2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
    3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
    """

    @staticmethod
    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
        """
        Read every GAOKAO-Bench JSON file under `path`/data and build the inference dataset.

        Args:
            path: Root directory of the GAOKAO-Bench checkout; it must contain
                "data/<category>" folders for the three question categories.
            logger: Not used by this loader.
            few_shot: Not used by this loader.

        Returns:
            A nested dict of the form
            {"test": {subject: {"data": [sample, ...], "inference_kwargs": {...}}}}.
            (The `List[Dict]` annotation is kept unchanged for interface compatibility.)
        """
        dataset = {"test": {}}
        for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
            category_dir = os.path.join(path, "data", category)
            # os.listdir returns every directory entry, including stray files such as
            # ".DS_Store"; only the JSON files belong to the dataset.
            files = sorted(file for file in os.listdir(category_dir) if file.endswith(".json"))

            for file in files:
                # File names look like "2010-2022_Chemistry_MCQs.json": drop the
                # 10-character year prefix and the ".json" suffix to get the subject.
                subject = " ".join(file[10:-5].split("_"))
                dataset["test"][subject] = {"data": []}

                # Parse the file and release the handle before converting the samples.
                with open(os.path.join(category_dir, file), encoding="utf-8") as f:
                    samples = json.load(f)["example"]

                # It's been tested that each data sample in one subcategory has the same inference arguments.
                inference_kwargs = deepcopy(default_inference_kwargs)
                is_single_answer_mcq = category == "Multiple-choice_Questions" and subject not in multi_choice_datasets
                # The option letters are read from the first sample; an empty file keeps `all_classes` as None.
                if is_single_answer_mcq and samples:
                    inference_kwargs["all_classes"] = get_all_classes(samples[0]["question"])
                if subject in english_qa_datasets:
                    inference_kwargs["language"] = "English"
                if subject in chinese_qa_datasets:
                    inference_kwargs["language"] = "Chinese"

                dataset["test"][subject]["inference_kwargs"] = inference_kwargs

                for sample in samples:
                    answer = sample["answer"]

                    # Convert multi-choice answers to a single string.
                    # We will convert it back when evaluating.
                    # We do this because if target is a list, it should be only used for multiple target answers.
                    if subject in multi_choice_datasets:
                        answer = "".join(answer)

                    # A one-element answer list is unwrapped to its only element.
                    if isinstance(answer, list) and len(answer) == 1:
                        answer = answer[0]

                    data_sample = {
                        "dataset": "gaokaobench",
                        "split": "test",
                        # category[:-10] strips the trailing "_Questions".
                        "category": f"{category[:-10]}-{subject}",
                        "instruction": sample["question"].strip() + "\n答案:",
                        "input": "",
                        "output": "",
                        "target": answer,
                    }

                    dataset["test"][subject]["data"].append(data_sample)

        return dataset