hendrycks_test.py 5.56 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
"""
Measuring Massive Multitask Language Understanding
https://arxiv.org/pdf/2009.03300.pdf

The Hendrycks Test is a benchmark that measures a text model's multitask accuracy.
The test covers 57 tasks including elementary mathematics, US history, computer 
science, law, and more. To attain high accuracy on this test, models must possess
extensive world knowledge and problem solving ability. By comprehensively evaluating
the breadth and depth of a model’s academic and professional understanding, 
Hendryck's Test can be used to analyze models across many tasks and to identify 
important shortcomings.

Homepage: https://github.com/hendrycks/test

@article{hendryckstest2021,
  title={Measuring Massive Multitask Language Understanding},
  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
  year={2021}
}
"""
Andy Zou's avatar
Andy Zou committed
22
import csv
Jonathan Tow's avatar
Jonathan Tow committed
23
24
import random
from lm_eval.base import MultipleChoiceTask
Andy Zou's avatar
Andy Zou committed
25
from ..utils import sh
Jonathan Tow's avatar
Jonathan Tow committed
26
from pathlib import Path
27
from best_download import download_file
Jonathan Tow's avatar
Jonathan Tow committed
28
29
30
31
32
33
34
35
36
37

# The 57 MMLU subject names; create_all_tasks() turns each into one
# "hendrycksTest-<subject>" task.
SUBJECTS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]


def create_all_tasks():
    """Build one task per MMLU subject, keyed by registry name.

    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    tasks = {}
    for subject in SUBJECTS:
        tasks[f"hendrycksTest-{subject}"] = create_task(subject)
    return tasks

Jonathan Tow's avatar
Jonathan Tow committed
50

Andy Zou's avatar
Andy Zou committed
51
52
53
54
55
56
def create_task(subject):
    """Return a concrete Task class bound to a single MMLU *subject*."""

    class HendrycksTest(GeneralHendrycksTest):
        # Fix the subject at class-creation time so the registry can
        # instantiate the task with no arguments.
        def __init__(self):
            super().__init__(subject)

    return HendrycksTest

Jonathan Tow's avatar
Jonathan Tow committed
57

Andy Zou's avatar
Andy Zou committed
58
class GeneralHendrycksTest(MultipleChoiceTask):
    """Base multiple-choice task covering one MMLU (Hendrycks Test) subject."""

    VERSION = 0
    # Directory that download() unpacks the upstream data tarball into.
    DATASET_PATH = Path("data/hendrycksTest/")

    def __init__(self, subject):
        # Subject name (e.g. "abstract_algebra"); selects which per-subject
        # CSV files the *_docs methods read.
        # NOTE(review): set before super().__init__() on purpose — the base
        # class init presumably uses task state; keep this order.
        self.subject = subject
        super().__init__()

    def download(self):
        """Fetch and unpack the MMLU data tarball into DATASET_PATH.

        Idempotent: a 'done' marker file written after a successful unpack
        makes subsequent calls no-ops.
        """
        if not (self.DATASET_PATH / 'done').exists():
            sh("mkdir -p data")
            # Checksum pins the exact upstream archive contents.
            download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
            # Unpack, rename the archive's "data" dir to our dataset path,
            # then drop the 'done' marker so this branch is skipped next time.
            sh("""
            tar -xf data/data.tar -C data/
            rm data/data.tar
            mv data/data data/hendrycksTest
            touch data/hendrycksTest/done
            """)
Andy Zou's avatar
Andy Zou committed
76
77

    def has_training_docs(self):
        """The dataset ships auxiliary_train and dev splits usable for training."""
        return True
Andy Zou's avatar
Andy Zou committed
79
80

    def has_validation_docs(self):
        """Every subject has a val CSV."""
        return True
Andy Zou's avatar
Andy Zou committed
82
83
84
85

    def has_test_docs(self):
        """Every subject has a test CSV."""
        return True

Jonathan Tow's avatar
Jonathan Tow committed
86
    def _convert_standard(self, doc):
        """Convert one raw CSV row into the multiple-choice doc dict.

        Rows look like: [question, choice1, choice2, choice3, choice4, answer_letter].

        The rendered query has the form:

            Question: <prompt>
            Choices:
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:

        :return: {"query": str, "choices": list[str], "gold": int}
        """
        letters = ['A', 'B', 'C', 'D']
        lines = [f"Question: {doc[0]}", "Choices:"]
        for letter, option in zip(letters, doc[1:5]):
            lines.append(f"{letter}. {option}")
        query = "\n".join(lines) + "\nAnswer:"
        return {
            "query": query,
            "choices": doc[1:5],
            "gold": letters.index(doc[5]),
        }
Andy Zou's avatar
Andy Zou committed
107

Jonathan Tow's avatar
Jonathan Tow committed
108
    def _load_docs(self, filename):
        """Lazily parse one subject CSV file into task doc dicts.

        :param filename: path to a CSV with rows
            [question, choice1..choice4, answer_letter]
        :return: generator of dicts as produced by _convert_standard
        """
        # Fix: previously the file was opened and never closed — the handle
        # leaked, kept alive only by the returned generator until GC.
        # A with-block inside the generator closes it deterministically once
        # the generator is exhausted (or closed).  newline='' is the mode the
        # csv module requires so quoted fields containing newlines parse
        # correctly; encoding is pinned so behavior doesn't depend on the
        # platform's locale default.
        with open(filename, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, quotechar='"', delimiter=',')
            for doc in reader:
                yield self._convert_standard(doc)
Andy Zou's avatar
Andy Zou committed
111
112

    def training_docs(self):
        """Materialize all training docs: auxiliary_train plus every dev CSV."""
        return [
            doc
            for split in ("auxiliary_train", "dev")
            for path in (self.DATASET_PATH / split).iterdir()
            for doc in self._load_docs(path)
        ]
Andy Zou's avatar
Andy Zou committed
118
119

    def validation_docs(self):
        """Stream the validation docs for this subject."""
        return self._load_docs(self.DATASET_PATH / "val" / f"{self.subject}_val.csv")
Andy Zou's avatar
Andy Zou committed
122
123

    def test_docs(self):
        """Stream the test docs for this subject."""
        return self._load_docs(self.DATASET_PATH / "test" / f"{self.subject}_test.csv")
Andy Zou's avatar
Andy Zou committed
126

127
    def fewshot_examples(self, k, rnd):
        """Sample k few-shot examples for this subject using *rnd*.

        Deliberately not sampled from training_docs(): the dev split matches
        the val/test distribution, while auxiliary_train does not.
        """
        if self._fewshot_docs is None:
            dev_csv = self.DATASET_PATH / "dev" / f"{self.subject}_dev.csv"
            # Cache the dev docs so repeated sampling reloads nothing.
            self._fewshot_docs = list(self._load_docs(dev_csv))

        return rnd.sample(list(self._fewshot_docs), k)
Andy Zou's avatar
Andy Zou committed
137

Jonathan Tow's avatar
Jonathan Tow committed
138
139
    def doc_to_text(self, doc):
        """Return the prompt text: the query pre-rendered by _convert_standard."""
        return doc["query"]