superglue.py 8.15 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
"""
SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems
https://w4ngatang.github.io/static/papers/superglue.pdf

SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.

Homepage: https://super.gluebenchmark.com/

TODO: WSC requires free-form generation.
"""
Jason Phang's avatar
Jason Phang committed
12
import numpy as np
13
14
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
jon-tow's avatar
jon-tow committed
15
from lm_eval.base import rf, PromptSourceTask
Jonathan Tow's avatar
Jonathan Tow committed
16
17
from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno
from lm_eval.utils import general_detokenize
Jason Phang's avatar
Jason Phang committed
18

Jason Phang's avatar
Jason Phang committed
19

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
_CITATION = """
@inproceedings{NEURIPS2019_4496bf24,
    author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
    booktitle = {Advances in Neural Information Processing Systems},
    editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
    pages = {},
    publisher = {Curran Associates, Inc.},
    title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
    url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
    volume = {32},
    year = {2019}
}
"""


jon-tow's avatar
jon-tow committed
35
class BoolQ(PromptSourceTask):
    """SuperGLUE BoolQ task (``super_glue`` / ``boolq`` dataset configuration)."""

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the training split as a list, materialising it on first use.

        The list is cached on ``self._training_docs`` so later calls reuse it.
        """
        # NOTE(review): assumes the base class initialises ``_training_docs``
        # to None before this is called - confirm against PromptSourceTask.
        cached = self._training_docs
        if cached is None:
            cached = list(self.dataset["train"])
            self._training_docs = cached
        return cached

    def validation_docs(self):
        """Return the validation split exactly as stored in ``self.dataset``."""
        return self.dataset["validation"]

Jason Phang's avatar
Jason Phang committed
57

jon-tow's avatar
jon-tow committed
58
class CommitmentBank(PromptSourceTask):
    """SuperGLUE CommitmentBank task (``super_glue`` / ``cb`` dataset configuration).

    Reports per-document accuracy and a three-class averaged F1.
    """

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the training split as a list, cached on ``self._training_docs``."""
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        """Return the validation split exactly as stored in ``self.dataset``."""
        return self.dataset["validation"]

    def process_results(self, doc, results):
        """Score a single document.

        Args:
            doc: Document mapping; its ``"label"`` entry is the gold class.
            results: Per-choice scores; the index of the largest one is taken
                as the predicted class.

        Returns:
            A dict with ``"acc"`` (1.0 when the prediction matches the gold
            label, else 0.0) and ``"f1"`` (the ``(pred, gold)`` pair, which is
            aggregated later by ``cb_multi_fi``).
        """
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1.0 if pred == gold else 0.0

        return {"acc": acc, "f1": (pred, gold)}

    def higher_is_better(self):
        """Both reported metrics improve as they increase."""
        return {"acc": True, "f1": True}

    @classmethod
    def cb_multi_fi(cls, items):
        """Average the one-vs-rest F1 scores of classes 0, 1 and 2.

        Args:
            items: Iterable of ``(pred, gold)`` pairs as produced by
                ``process_results`` under the ``"f1"`` key.

        Returns:
            The mean of the three binary F1 scores.
        """
        # ``import sklearn`` alone does not guarantee that the ``metrics``
        # submodule is loaded; import it explicitly so this method does not
        # depend on some other module having imported it first.
        import sklearn.metrics

        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        # One binary F1 per class label: "is this class" vs. "is not".
        per_class_f1 = [
            sklearn.metrics.f1_score(y_true=golds == label, y_pred=preds == label)
            for label in (0, 1, 2)
        ]
        return mean(per_class_f1)

    def aggregation(self):
        """Map each metric name to the function that aggregates it over docs."""
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }
Jason Phang's avatar
Jason Phang committed
106

Jason Phang's avatar
Jason Phang committed
107

jon-tow's avatar
jon-tow committed
108
class Copa(PromptSourceTask):
    """SuperGLUE COPA task (``super_glue`` / ``copa`` dataset configuration)."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the training split as a list, cached on ``self._training_docs``."""
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        """Return the validation split exactly as stored in ``self.dataset``."""
        return self.dataset["validation"]

    def process_results(self, doc, results):
        """Score a single document.

        Args:
            doc: Document mapping; its ``"label"`` entry is the gold choice.
            results: Per-choice scores; the index of the largest one is taken
                as the prediction.

        Returns:
            ``{"acc": 1.0}`` when the prediction equals the gold label,
            otherwise ``{"acc": 0.0}``.
        """
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1.0 if pred == gold else 0.0

        return {"acc": acc}

    def higher_is_better(self):
        """Accuracy improves as it increases."""
        return {"acc": True}

    def aggregation(self):
        """Accuracy is averaged over documents."""
        return {"acc": mean}

    @staticmethod
    def convert_choice(choice):
        """Return ``choice`` with its first character lower-cased.

        Slicing (``choice[:1]``) is used instead of indexing so that an empty
        string is returned unchanged rather than raising ``IndexError``.
        """
        return choice[:1].lower() + choice[1:]


jon-tow's avatar
jon-tow committed
148
class MultiRC(PromptSourceTask):
    """SuperGLUE MultiRC task (``super_glue`` / ``multirc`` dataset configuration)."""

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the training split as a list, materialising it on first use."""
        cached = self._training_docs
        if cached is None:
            cached = list(self.dataset["train"])
            self._training_docs = cached
        return cached

    def validation_docs(self):
        """Return the validation split exactly as stored in ``self.dataset``."""
        return self.dataset["validation"]

    def process_results(self, doc, results):
        """Turn the two scores in ``results`` into a boolean prediction.

        ``results`` must unpack into exactly two values: the score of the
        "true" choice followed by the score of the "false" choice
        (presumably log-likelihoods - TODO confirm against the request code).
        The prediction is paired with ``doc`` because the ``acc_all``
        aggregator receives ``(pred, doc)`` tuples.
        """
        true_score, false_score = results
        return {"acc": (true_score > false_score, doc)}

    def higher_is_better(self):
        """Accuracy improves as it increases."""
        return {"acc": True}

    def aggregation(self):
        """Accuracy is aggregated with ``acc_all``."""
        return {"acc": acc_all}
Jason Phang's avatar
multirc  
Jason Phang committed
180

Jason Phang's avatar
Jason Phang committed
181

jon-tow's avatar
jon-tow committed
182
class ReCoRD(PromptSourceTask):
    """SuperGLUE ReCoRD task (``super_glue`` / ``record`` dataset configuration).

    Reports token-level F1 and exact match, each averaged over documents.
    """

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the training split as a list, cached on ``self._training_docs``.

        Documents are stored unchanged, one list entry per dataset row.
        """
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        """Lazily yield every document of the validation split, unchanged."""
        yield from self.dataset["validation"]

    def process_results(self, doc, results):
        """Score one document against its gold answers.

        ReCoRD's evaluation is simple:
        - pick the candidate entity with the highest score,
        - compute token F1 and exact match for that entity PER EXAMPLE
          (taking the best value over all gold answers),
        - the per-example values are averaged later by ``aggregation``.

        Args:
            doc: Document mapping with ``"entities"`` (candidates) and
                ``"answers"`` (gold answers).
            results: One entry per candidate; element ``[0]`` of each entry is
                used as that candidate's score.

        Returns:
            ``{"f1": ..., "em": ...}`` for this document.
        """
        # TODO (jon-tow): Look at result
        candidate_scores = [result[0] for result in results]
        best_idx = np.argmax(candidate_scores)

        prediction = doc["entities"][best_idx]
        answers = doc["answers"]

        return {
            "f1": metric_max_over_ground_truths(
                squad_metrics.compute_f1, prediction, answers
            ),
            "em": metric_max_over_ground_truths(
                squad_metrics.compute_exact, prediction, answers
            ),
        }

    def higher_is_better(self):
        """Both reported metrics improve as they increase."""
        return {"f1": True, "em": True}

    def aggregation(self):
        """Both metrics are averaged over documents."""
        return {"f1": mean, "em": mean}


jon-tow's avatar
jon-tow committed
246
class WordsInContext(PromptSourceTask):
    """SuperGLUE WiC task (``super_glue`` / ``wic`` dataset configuration)."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the training split as a list, materialising it on first use."""
        cached = self._training_docs
        if cached is None:
            cached = list(self.dataset["train"])
            self._training_docs = cached
        return cached

    def validation_docs(self):
        """Return the validation split exactly as stored in ``self.dataset``."""
        return self.dataset["validation"]

    def higher_is_better(self):
        """Accuracy improves as it increases."""
        return {"acc": True}

    def aggregation(self):
        """Accuracy is averaged over documents."""
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
273
274


jon-tow's avatar
jon-tow committed
275
class SGWinogradSchemaChallenge(PromptSourceTask):
    """SuperGLUE WSC task (``super_glue`` / ``wsc.fixed`` dataset configuration)."""

    VERSION = 0
    # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
    #       binary version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc.fixed"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the positively-labelled training documents, cached after first use.

        Returns ``None`` when ``has_training_docs()`` is false.
        """
        if not self.has_training_docs():
            return None
        if self._training_docs is None:
            # GPT-3 Paper's format only uses positive examples for fewshot "training"
            self._training_docs = [
                example for example in self.dataset["train"] if example["label"]
            ]
        return self._training_docs

    def validation_docs(self):
        """Return the validation split exactly as stored in ``self.dataset``."""
        return self.dataset["validation"]

    def higher_is_better(self):
        """Accuracy improves as it increases."""
        return {"acc": True}

    def aggregation(self):
        """Accuracy is averaged over documents."""
        return {"acc": mean}