hendrycks_test.py 4.6 KB
Newer Older
Andy Zou's avatar
Andy Zou committed
1
import csv
Jonathan Tow's avatar
Jonathan Tow committed
2
3
import random
from lm_eval.base import MultipleChoiceTask
Andy Zou's avatar
Andy Zou committed
4
from ..utils import sh
Jonathan Tow's avatar
Jonathan Tow committed
5
from pathlib import Path
6
from best_download import download_file
Jonathan Tow's avatar
Jonathan Tow committed
7
8
9
10
11
12
13
14
15
16

# The 57 subject names of the Hendrycks test. Each name selects the
# per-subject CSV files "<subject>_dev.csv", "<subject>_val.csv" and
# "<subject>_test.csv" read by GeneralHendrycksTest.
SUBJECTS = [
    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics',
    'clinical_knowledge', 'college_biology', 'college_chemistry',
    'college_computer_science', 'college_mathematics', 'college_medicine',
    'college_physics', 'computer_security', 'conceptual_physics',
    'econometrics', 'electrical_engineering', 'elementary_mathematics',
    'formal_logic', 'global_facts', 'high_school_biology',
    'high_school_chemistry', 'high_school_computer_science',
    'high_school_european_history', 'high_school_geography',
    'high_school_government_and_politics', 'high_school_macroeconomics',
    'high_school_mathematics', 'high_school_microeconomics',
    'high_school_physics', 'high_school_psychology', 'high_school_statistics',
    'high_school_us_history', 'high_school_world_history', 'human_aging',
    'human_sexuality', 'international_law', 'jurisprudence',
    'logical_fallacies', 'machine_learning', 'management', 'marketing',
    'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios',
    'nutrition', 'philosophy', 'prehistory', 'professional_accounting',
    'professional_law', 'professional_medicine', 'professional_psychology',
    'public_relations', 'security_studies', 'sociology', 'us_foreign_policy',
    'virology', 'world_religions',
]


def create_all_tasks():
    """Creates a dictionary of tasks from the list of subjects.

    The values are task *classes* as returned by ``create_task`` (one per
    entry of ``SUBJECTS``), not task instances.

    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
    return {
        f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS
    }

Jonathan Tow's avatar
Jonathan Tow committed
29

Andy Zou's avatar
Andy Zou committed
30
31
32
33
34
35
def create_task(subject):
    """Build a task class that is bound to a single subject.

    :param subject: subject name handed on to ``GeneralHendrycksTest``
    :return: a ``GeneralHendrycksTest`` subclass whose constructor takes no
        arguments
    """
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            # `subject` is captured from the enclosing create_task() call.
            super().__init__(subject=subject)

    return HendrycksTest

Jonathan Tow's avatar
Jonathan Tow committed
36

Andy Zou's avatar
Andy Zou committed
37
class GeneralHendrycksTest(MultipleChoiceTask):
    """Multiple-choice task over one subject of the Hendrycks test.

    Documents are read from CSV files below ``DATASET_PATH``. Every CSV row
    is accessed as six columns: the question, four answer options, and the
    letter (A-D) of the correct option.
    """
    VERSION = 0
    DATASET_PATH = Path("data/hendrycksTest/")

    def __init__(self, subject):
        """
        :param subject: subject name used to build the per-subject CSV file
            names (e.g. an entry of SUBJECTS)
        """
        # NOTE(review): presumably the base initialiser triggers download()
        # and sets up the `_fewshot_docs` cache -- confirm against
        # lm_eval.base. `subject` is assigned first so it is available
        # either way.
        self.subject = subject
        super().__init__()

    def download(self):
        """Download and unpack the dataset unless a previous run finished.

        A ``done`` marker file inside ``DATASET_PATH`` records a completed
        extraction; when it exists nothing is downloaded.
        """
        if (self.DATASET_PATH / 'done').exists():
            return
        sh("mkdir -p data")
        download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
        # A target directory left behind by an interrupted run (no marker
        # file) is removed first; otherwise `mv` would place the extracted
        # folder *inside* it and the marker would be written for a broken
        # layout.
        sh("""
            tar -xf data/data.tar -C data/
            rm data/data.tar
            rm -rf data/hendrycksTest
            mv data/data data/hendrycksTest
            touch data/hendrycksTest/done
            """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _convert_standard(self, doc):
        """Convert one CSV row into the document dict used by this task.

        The generated prompt has the form::

            Question: <prompt>
            Choices:
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:

        :param doc: row of the form
            [question, choice1, choice2, choice3, choice4, answer_letter]
        :return: dict with "query" (the prompt text), "choices" (the four
            option strings) and "gold" (0-based index of the answer letter)
        :raises IndexError: if the row has fewer than six columns
        :raises ValueError: if the answer column is not one of A, B, C, D
        """
        letters = ['A', 'B', 'C', 'D']
        options = doc[1:5]
        option_lines = "".join(
            f"{letter}. {option}\n" for letter, option in zip(letters, options)
        )
        query = "Question: " + doc[0] + "\nChoices:\n" + option_lines + "Answer:"
        return {
            "query": query,
            "choices": options,
            "gold": letters.index(doc[5])
        }

    def _load_docs(self, filename):
        """Read a CSV file and convert every row with ``_convert_standard``.

        :param filename: path of the CSV file to read
        :return: list of converted documents, in file order
        """
        # The context manager closes the file deterministically.
        # newline='' is what the csv module asks for so that line breaks
        # embedded in quoted fields are parsed correctly.
        with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
            reader = csv.reader(csv_file, quotechar='"', delimiter=',')
            return [self._convert_standard(row) for row in reader]

    def training_docs(self):
        """Return the documents of every file in auxiliary_train/ and dev/.

        Note that all files of both directories are read, not only the one
        belonging to ``self.subject``.
        """
        docs = []
        for train_dir in ["auxiliary_train", "dev"]:
            # iterdir() yields entries in arbitrary order; sorting keeps the
            # document order reproducible across runs and machines.
            for path in sorted((self.DATASET_PATH / train_dir).iterdir()):
                docs.extend(self._load_docs(path))
        return docs

    def validation_docs(self):
        """Return the documents of this subject's validation file."""
        filename = self.DATASET_PATH / "val" / f"{self.subject}_val.csv"
        return self._load_docs(filename)

    def test_docs(self):
        """Return the documents of this subject's test file."""
        filename = self.DATASET_PATH / "test" / f"{self.subject}_test.csv"
        return self._load_docs(filename)

    def fewshot_examples(self, k, rnd):
        """Sample ``k`` few-shot documents from this subject's dev file.

        :param k: number of examples to draw
        :param rnd: object providing ``sample(population, k)``
            (e.g. a ``random.Random`` instance)
        :return: list of ``k`` documents
        """
        # fewshot_examples is not just sampling from train_docs because dev is
        # in the same distribution as val/test but auxiliary_train isn't

        if self._fewshot_docs is None:
            # Parsed once and cached on the instance for later calls.
            filename = self.DATASET_PATH / "dev" / f"{self.subject}_dev.csv"
            self._fewshot_docs = self._load_docs(filename)

        return rnd.sample(self._fewshot_docs, k)

    def doc_to_text(self, doc):
        """Return the prompt text stored under the document's "query" key."""
        return doc["query"]