"megatron/text_generation/forward_step.py" did not exist on "8f160844f93dc80272ea1eba4aae441577c24907"
hendrycks_test.py 5.59 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
"""
Measuring Massive Multitask Language Understanding
https://arxiv.org/pdf/2009.03300.pdf

The Hendryck's Test is a benchmark that measured a text model’s multitask accuracy.
The test covers 57 tasks including elementary mathematics, US history, computer 
science, law, and more. To attain high accuracy on this test, models must possess
extensive world knowledge and problem solving ability. By comprehensively evaluating
the breadth and depth of a model’s academic and professional understanding, 
Hendryck's Test can be used to analyze models across many tasks and to identify 
important shortcomings.

Homepage: https://github.com/hendrycks/test
"""
import csv
import random
from lm_eval.base import MultipleChoiceTask
from ..utils import sh
from pathlib import Path
from best_download import download_file


_CITATION = """
@article{hendryckstest2021,
    title={Measuring Massive Multitask Language Understanding},
    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
    year={2021}
}
"""


# The 57 MMLU subject names; one evaluation task is created per subject.
# (Interleaved VCS blame residue in the original span broke the list literal.)
SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
            'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
            'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
            'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
            'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics',
            'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics',
            'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence',
            'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes',
            'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
            'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']


def create_all_tasks():
    """Creates a dictionary of tasks from a list of subjects

    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    # One task per MMLU subject, keyed by the harness-visible task name.
    return {
        f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS
    }

def create_task(subject):
    """Return a Task subclass bound to a single MMLU *subject*.

    The returned class is a zero-argument-constructible wrapper around
    GeneralHendrycksTest, closing over the subject name.
    """
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            super().__init__(subject)

    return HendrycksTest

class GeneralHendrycksTest(MultipleChoiceTask):
    """Multiple-choice task over one subject of the Hendrycks MMLU benchmark.

    Each CSV row is expected to be
    (question, choice_A, choice_B, choice_C, choice_D, answer_letter).
    """
    VERSION = 0
    DATASET_PATH = Path("data/hendrycksTest/")

    def __init__(self, subject):
        # Subject name selects which per-subject CSV files are loaded.
        self.subject = subject
        super().__init__()

    def download(self):
        """Fetch and unpack the MMLU tarball; idempotent via a 'done' marker file."""
        if not (self.DATASET_PATH / 'done').exists():
            sh("mkdir -p data")
            # Checksum pins the exact published archive.
            download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
            sh("""
            tar -xf data/data.tar -C data/
            rm data/data.tar
            mv data/data data/hendrycksTest
            touch data/hendrycksTest/done
            """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _convert_standard(self, doc):
        """Convert one raw CSV row into the harness's standard doc dict."""
        def format_example(doc, choices):
            """
                Question: <prompt>
                Choices:
                A. <choice1>
                B. <choice2>
                C. <choice3>
                D. <choice4>
                Answer:
            """
            prompt = "Question: " + doc[0] + "\nChoices:\n"
            prompt += "".join([f"{choices[j]}. {doc[j+1]}\n" for j in range(4)])
            prompt += "Answer:"
            return prompt
        choices = ['A', 'B', 'C', 'D']
        return {
            "query": format_example(doc, choices),
            "choices": doc[1:5],
            # doc[5] is the gold answer letter; map it to an index into choices.
            "gold": choices.index(doc[5])
        }

    def _load_docs(self, filename):
        """Lazily yield converted docs from one CSV file.

        Fix: the original opened the file without ever closing it (handle
        leak). A generator function with a `with` block keeps the lazy
        iteration semantics while releasing the handle on exhaustion.
        `newline=''` is the csv-module-recommended open mode so quoted
        fields containing newlines are parsed correctly.
        """
        with open(filename, 'r', newline='') as f:
            for doc in csv.reader(f, quotechar='"', delimiter=','):
                yield self._convert_standard(doc)

    def training_docs(self):
        # Training pool is every file in both auxiliary_train and dev splits.
        docs = []
        for train_dir in ["auxiliary_train", "dev"]:
            for f in (self.DATASET_PATH / train_dir).iterdir():
                docs.extend(self._load_docs(f))
        return docs

    def validation_docs(self):
        filename = self.DATASET_PATH / "val" / f"{self.subject}_val.csv"
        return self._load_docs(filename)

    def test_docs(self):
        filename = self.DATASET_PATH / "test" / f"{self.subject}_test.csv"
        return self._load_docs(filename)

    def fewshot_examples(self, k, rnd):
        # fewshot_examples is not just sampling from train_docs because dev is
        # in the same distribution as val/test but auxiliary_train isn't
        filename = self.DATASET_PATH / "dev" / f"{self.subject}_dev.csv"

        # NOTE(review): `_fewshot_docs` is presumably initialized to None by the
        # MultipleChoiceTask base class — confirm against lm_eval.base.
        if self._fewshot_docs is None:
            self._fewshot_docs = list(self._load_docs(filename))

        return rnd.sample(list(self._fewshot_docs), k)

    def doc_to_text(self, doc):
        return doc["query"]