"""
Measuring Massive Multitask Language Understanding
https://arxiv.org/pdf/2009.03300.pdf

The Hendryck's Test is a benchmark that measures a text model’s multitask accuracy.
The test covers 57 tasks including elementary mathematics, US history, computer
science, law, and more. To attain high accuracy on this test, models must possess
extensive world knowledge and problem solving ability. By comprehensively evaluating
the breadth and depth of a model’s academic and professional understanding,
Hendryck's Test can be used to analyze models across many tasks and to identify
important shortcomings.

Homepage: https://github.com/hendrycks/test
"""
from lm_eval.base import MultipleChoiceTask

# BibTeX citation for the MMLU paper (Hendrycks et al., ICLR 2021).
_CITATION = """
@article{hendryckstest2021,
    title={Measuring Massive Multitask Language Understanding},
    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
    year={2021}
}
"""
# The 57 MMLU subject names; each one is registered below as a separate
# `hendrycksTest-<subject>` task and used as the dataset config name.
SUBJECTS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]
def create_all_tasks():
    """Build the full suite of MMLU tasks, one per subject.

    :return: dict mapping task name to task class,
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    tasks = {}
    for subject in SUBJECTS:
        tasks[f"hendrycksTest-{subject}"] = create_task(subject)
    return tasks


def create_task(subject):
    """Return a GeneralHendrycksTest subclass pinned to a single subject.

    The returned class takes no constructor arguments; *subject* is captured
    by the closure and forwarded to the base-class initializer.
    """

    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            super().__init__(subject)

    return HendrycksTest


class GeneralHendrycksTest(MultipleChoiceTask):
    """Multiple-choice task over one MMLU ("Hendrycks Test") subject.

    The subject is supplied at construction time and used as the Hugging
    Face dataset config name (``DATASET_NAME``) for ``cais/mmlu``.
    """

    VERSION = 1
    DATASET_PATH = "cais/mmlu"
    DATASET_NAME = None

    def __init__(self, subject):
        # Bind the dataset config to this subject before the base class
        # initializer (presumably) loads the data — set it first.
        self.DATASET_NAME = subject
        super().__init__()

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        # Lazily convert each raw validation record into the harness format.
        for record in self.dataset["validation"]:
            yield self._process_doc(record)

    def test_docs(self):
        # Lazily convert each raw test record into the harness format.
        for record in self.dataset["test"]:
            yield self._process_doc(record)

    def _format_subject(self, subject):
        # e.g. "abstract_algebra" -> "abstract algebra"
        return " ".join(subject.split("_"))

    def fewshot_context(self, doc, num_fewshot, **kwargs):
        # Prepend the subject-specific preamble before delegating to the
        # base-class context builder.
        topic = self._format_subject(self.DATASET_NAME)
        kwargs["description"] = (
            f"The following are multiple choice questions (with answers) about {topic}."
        )
        return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)

    def _process_doc(self, doc):
        """Convert a raw MMLU record into the query/choices/gold dict.

        The rendered query looks like::

            <prompt>
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:
        """
        option_labels = ["A", "B", "C", "D"]
        question = doc["question"].strip()
        rendered = "".join(
            f"{label}. {text}\n"
            for label, text in zip(option_labels, doc["choices"])
        )
        return {
            "query": f"{question}\n{rendered}Answer:",
            "choices": option_labels,
            "gold": doc["answer"],
        }

    def fewshot_examples(self, k, rnd):
        # Few-shot examples come from the "dev" split instead of train_docs:
        # dev matches the val/test distribution, while auxiliary_train does not.
        if self._fewshot_docs is None:
            self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))

        # Keep the dev set's original order without sampling, just as in the
        # original code https://github.com/hendrycks/test/blob/master/evaluate.py#L28
        return self._fewshot_docs[:k]

    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]