"""
Measuring Massive Multitask Language Understanding
https://arxiv.org/pdf/2009.03300.pdf

The Hendrycks Test is a benchmark that measures a text model’s multitask accuracy.
The test covers 57 tasks including elementary mathematics, US history, computer
science, law, and more. To attain high accuracy on this test, models must possess
extensive world knowledge and problem-solving ability. By comprehensively evaluating
the breadth and depth of a model’s academic and professional understanding,
the Hendrycks Test can be used to analyze models across many tasks and to identify
important shortcomings.

Homepage: https://github.com/hendrycks/test
"""
from lm_eval.base import MultipleChoiceTask

_CITATION = """
@article{hendryckstest2021,
    title={Measuring Massive Multitask Language Understanding},
    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
    year={2021}
}
"""


SUBJECTS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]


def create_all_tasks():
    """Creates a dictionary of tasks from a list of subjects
    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS}

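# A minimal usage sketch (hypothetical snippet; assumes this module lives at
# lm_eval.tasks.hendrycks_test, as in the lm-evaluation-harness layout):
#
#   from lm_eval.tasks.hendrycks_test import create_all_tasks
#   tasks = create_all_tasks()
#   anatomy_task = tasks["hendrycksTest-anatomy"]()  # instantiate one subject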

def create_task(subject):
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            super().__init__(subject)

    return HendrycksTest


class GeneralHendrycksTest(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "cais/mmlu"
    DATASET_NAME = None

    def __init__(self, subject):
        self.DATASET_NAME = subject
        super().__init__()

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

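    def training_docs(self):
        # has_training_docs() returns True, so a train split should be
        # provided. Minimal sketch, assuming the dataset exposes the
        # "auxiliary_train" split mentioned in fewshot_examples below;
        # that split is not drawn from the same distribution as dev/val/test.
        return map(self._process_doc, self.dataset["auxiliary_train"])
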
    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])

    def fewshot_context(self, doc, num_fewshot, **kwargs):
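        # Prepend a subject-specific instruction line; the base class's
        # fewshot_context is expected to place `description` ahead of the
        # few-shot examples. The raw subject name is used, so underscores
        # appear verbatim (e.g. "about abstract_algebra").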
        subject = self.DATASET_NAME
        description = f"The following are multiple choice questions (with answers) about {subject}."
        kwargs["description"] = description
        return super().fewshot_context(doc=doc, num_fewshot=num_fewshot, **kwargs)

    def _process_doc(self, doc):
        def format_example(doc, keys):
            """
            <prompt>
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:
            """

            question = doc["question"]
            choices = "".join(
                [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]
            )
            prompt = f"{question}\n{choices}Answer:"
            return prompt

        keys = ["A", "B", "C", "D"]
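        # Illustrative shape of the processed doc (assuming cais/mmlu stores
        # `answer` as an integer index into the four choices):
        #   {"query": "<question>\nA. ...\nB. ...\nC. ...\nD. ...\nAnswer:",
        #    "choices": ["A", "B", "C", "D"],
        #    "gold": 2}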
        return {
            "query": format_example(doc, keys),
            "choices": keys,
            "gold": doc["answer"],
        }

    def fewshot_examples(self, k, rnd):
        # fewshot_examples is not just sampling from train_docs because dev is
        # in the same distribution as val/test but auxiliary_train isn't
        if self._fewshot_docs is None:
            self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))

        # Deterministically take the first k dev examples rather than sampling
        # randomly; a sampled variant would be rnd.sample(list(self._fewshot_docs), k)
        return self._fewshot_docs[:k]

    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]