hendrycks_test.py 4.66 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
"""
Measuring Massive Multitask Language Understanding
https://arxiv.org/pdf/2009.03300.pdf

The Hendrycks Test is a benchmark that measures a text model’s multitask accuracy.
The test covers 57 tasks including elementary mathematics, US history, computer
science, law, and more. To attain high accuracy on this test, models must possess
extensive world knowledge and problem-solving ability. By comprehensively evaluating
the breadth and depth of a model’s academic and professional understanding, the
Hendrycks Test can be used to analyze models across many tasks and to identify
important shortcomings.

Homepage: https://github.com/hendrycks/test
"""
Jonathan Tow's avatar
Jonathan Tow committed
15
16
from lm_eval.base import MultipleChoiceTask

17
18
19
20
21
22
23
24
25
26

# BibTeX entry for the paper "Measuring Massive Multitask Language Understanding".
_CITATION = """
@article{hendryckstest2021,
    title={Measuring Massive Multitask Language Understanding},
    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
    year={2021}
}
"""

Jonathan Tow's avatar
Jonathan Tow committed
27
28
29
30
31
32
33
34
35
36

# Names of the 57 benchmark subjects. Each entry is turned into a
# `hendrycksTest-<subject>` task by `create_all_tasks` and is also used as the
# dataset configuration name (`DATASET_NAME`) of the corresponding task.
SUBJECTS = [
    'abstract_algebra',
    'anatomy',
    'astronomy',
    'business_ethics',
    'clinical_knowledge',
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_medicine',
    'college_physics',
    'computer_security',
    'conceptual_physics',
    'econometrics',
    'electrical_engineering',
    'elementary_mathematics',
    'formal_logic',
    'global_facts',
    'high_school_biology',
    'high_school_chemistry',
    'high_school_computer_science',
    'high_school_european_history',
    'high_school_geography',
    'high_school_government_and_politics',
    'high_school_macroeconomics',
    'high_school_mathematics',
    'high_school_microeconomics',
    'high_school_physics',
    'high_school_psychology',
    'high_school_statistics',
    'high_school_us_history',
    'high_school_world_history',
    'human_aging',
    'human_sexuality',
    'international_law',
    'jurisprudence',
    'logical_fallacies',
    'machine_learning',
    'management',
    'marketing',
    'medical_genetics',
    'miscellaneous',
    'moral_disputes',
    'moral_scenarios',
    'nutrition',
    'philosophy',
    'prehistory',
    'professional_accounting',
    'professional_law',
    'professional_medicine',
    'professional_psychology',
    'public_relations',
    'security_studies',
    'sociology',
    'us_foreign_policy',
    'virology',
    'world_religions',
]


def create_all_tasks():
    """Create a dictionary of task classes, one per entry in SUBJECTS.

    Each key is the subject name prefixed with ``hendrycksTest-``; each value
    is the class returned by ``create_task`` for that subject (a class, not an
    instance).

    :return: {task_name: task_class}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    return {
        f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS
    }

Jonathan Tow's avatar
Jonathan Tow committed
49

Andy Zou's avatar
Andy Zou committed
50
51
52
53
54
55
def create_task(subject):
    """Build a task class bound to a single subject.

    :param subject: subject name forwarded to ``GeneralHendrycksTest.__init__``.
    :return: a subclass of ``GeneralHendrycksTest`` whose constructor takes no
        arguments and always uses ``subject``.
    """
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            # `subject` is captured from the enclosing call, so every class
            # returned by create_task carries its own subject.
            super().__init__(subject)
    return HendrycksTest

Jonathan Tow's avatar
Jonathan Tow committed
56

Andy Zou's avatar
Andy Zou committed
57
class GeneralHendrycksTest(MultipleChoiceTask):
    """Multiple-choice task for one subject of the Hendrycks Test.

    The subject is supplied at construction time and stored as the instance's
    ``DATASET_NAME``. Documents from the ``validation`` and ``test`` splits are
    converted by ``_process_doc`` into ``{"query", "choices", "gold"}`` dicts;
    few-shot examples are drawn from the ``dev`` split.
    """

    VERSION = 0
    DATASET_PATH = "hendrycks_test"
    # Placeholder; each instance overrides this with its subject in __init__.
    DATASET_NAME = None

    def __init__(self, subject):
        """Bind the task to ``subject`` and run the base-class initialiser.

        :param subject: subject name stored as ``DATASET_NAME``.
        """
        # Set before delegating to the base initialiser.
        # NOTE(review): presumably the base class reads DATASET_NAME while
        # loading the dataset - confirm against MultipleChoiceTask.
        self.DATASET_NAME = subject
        super().__init__()

    def has_training_docs(self):
        """Return False: this task exposes no training documents."""
        return False

    def has_validation_docs(self):
        """Return True: validation documents are available."""
        return True

    def has_test_docs(self):
        """Return True: test documents are available."""
        return True

    def validation_docs(self):
        """Return a lazy iterator of processed ``validation`` documents."""
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        """Return a lazy iterator of processed ``test`` documents."""
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
        """Convert a raw dataset record into the task's document format.

        :param doc: mapping with ``"question"``, ``"choices"`` and ``"answer"``
            entries. ``"answer"`` may be a letter from ``A``-``D`` or a value
            that is already the index of the correct choice.
        :return: dict with the prompt under ``"query"``, the unmodified
            ``"choices"`` and the index of the correct choice under ``"gold"``.
        :raises ValueError: if ``"answer"`` is a string that is not one of
            ``A``, ``B``, ``C`` or ``D``.
        """
        def format_example(doc, keys):
            """
                Question: <prompt>
                Choices:
                A. <choice1>
                B. <choice2>
                C. <choice3>
                D. <choice4>
                Answer:
            """
            prompt = "Question: " + doc["question"] + "\nChoices:\n"
            # zip stops at the shorter of keys/choices, so at most four
            # choices are rendered.
            prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])])
            prompt += "Answer:"
            return prompt

        keys = ['A', 'B', 'C', 'D']
        answer = doc["answer"]
        return {
            "query": format_example(doc, keys),
            "choices": doc["choices"],
            # Letter answers are mapped to their position in `keys`;
            # non-string answers are passed through unchanged.
            "gold": keys.index(answer) if isinstance(answer, str) else answer
        }

    def fewshot_examples(self, k, rnd):
        """Sample ``k`` few-shot examples from the ``dev`` split.

        :param k: number of examples to draw.
        :param rnd: random source providing ``sample(population, k)``.
        :return: list of ``k`` processed documents.
        """
        # fewshot_examples is not just sampling from train_docs because dev is
        # in the same distribution as val/test but auxiliary_train isn't

        # Process the dev split once and reuse it on later calls.
        # NOTE(review): relies on `_fewshot_docs` already existing (as None)
        # on the instance, presumably set by the base class - confirm.
        if self._fewshot_docs is None:
            self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))

        return rnd.sample(list(self._fewshot_docs), k)

    def doc_to_text(self, doc):
        """Return the prompt text stored under ``doc["query"]``."""
        return doc["query"]

    def should_decontaminate(self):
        """Return True: this task opts in to decontamination."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return ``doc["query"]`` as the text used for decontamination."""
        return doc["query"]