"""
Measuring Massive Multitask Language Understanding
https://arxiv.org/pdf/2009.03300.pdf

The Hendrycks Test is a benchmark that measures a text model's multitask accuracy.
The test covers 57 tasks including elementary mathematics, US history, computer
science, law, and more. To attain high accuracy on this test, models must possess
extensive world knowledge and problem-solving ability. By comprehensively evaluating
the breadth and depth of a model's academic and professional understanding, the
Hendrycks Test can be used to analyze models across many tasks and to identify
important shortcomings.

Homepage: https://github.com/hendrycks/test
"""
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@article{hendryckstest2021,
    title={Measuring Massive Multitask Language Understanding},
    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
    year={2021}
}
"""


SUBJECTS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]
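# Each entry above names a config of the `hendrycks_test` dataset on the
# Hugging Face Hub; the benchmark's 57 tasks map one-to-one onto these
# subjects. Lightweight sanity check (an added note, grounded in the paper):
assert len(SUBJECTS) == 57, "MMLU defines exactly 57 subjects"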


def create_all_tasks():
    """Creates a dictionary of tasks from a list of subjects
    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS}


def create_task(subject):
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            super().__init__(subject)

    return HendrycksTest
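# Note: `create_task` references `GeneralHendrycksTest`, which is defined
# below; the name is resolved when the factory is called, not when it is
# defined, so the ordering is safe at import time.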


class GeneralHendrycksTest(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "hendrycks_test"
    DATASET_NAME = None

    def __init__(self, subject):
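        # Assigning DATASET_NAME before calling the base constructor lets the
        # base Task load the matching subject config of `hendrycks_test`.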
        self.DATASET_NAME = subject
        super().__init__()

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
        def format_example(doc, keys):
            """
            Question: <prompt>
            Choices:
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:
            """
            prompt = "Question: " + doc["question"] + "\nChoices:\n"
            prompt += "".join(
                [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]
            )
            prompt += "Answer:"
            return prompt

        keys = ["A", "B", "C", "D"]
        return {
            "query": format_example(doc, keys),
            "choices": doc["choices"],
            "gold": keys.index(doc["answer"])
            if isinstance(doc["answer"], str)
            else doc["answer"],
        }

    def fewshot_examples(self, k, rnd):
        # fewshot_examples is not just sampling from train_docs because dev is
        # in the same distribution as val/test but auxiliary_train isn't

        if self._fewshot_docs is None:
            self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))

        return rnd.sample(list(self._fewshot_docs), k)

    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]
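

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the harness API;
    # the first run needs network access so `datasets` can fetch the
    # `hendrycks_test` data).
    import random

    task = create_task("anatomy")()
    doc = next(iter(task.validation_docs()))
    print(task.doc_to_text(doc))
    print("gold index:", doc["gold"])
    fewshot = task.fewshot_examples(k=5, rnd=random.Random(0))
    print("sampled", len(fewshot), "fewshot examples")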