"""
SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems
https://w4ngatang.github.io/static/papers/superglue.pdf

SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.

Homepage: https://super.gluebenchmark.com/

TODO: WSC requires free-form generation.
"""
import numpy as np
import sklearn.metrics
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.base import rf, PromptSourceTask
from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno
from lm_eval.utils import general_detokenize


_CITATION = """
@inproceedings{NEURIPS2019_4496bf24,
    author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
    booktitle = {Advances in Neural Information Processing Systems},
    editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
    pages = {},
    publisher = {Curran Associates, Inc.},
    title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
    url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
    volume = {32},
    year = {2019}
}
"""


class BoolQ(PromptSourceTask):
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]


class CommitmentBank(PromptSourceTask):
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
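        # `results` holds one loglikelihood per answer choice. F1 is deferred
        # to aggregation as a (pred, gold) pair, since multi-class F1 must be
        # computed over the whole split rather than per example.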
        acc = 1.0 if pred == gold else 0.0

        return {"acc": acc, "f1": (pred, gold)}

    def higher_is_better(self):
        return {"acc": True, "f1": True}

    @classmethod
    def cb_multi_f1(cls, items):
        """Macro F1 over the three CB labels: compute a one-vs-rest binary F1
        for each label (entailment, contradiction, neutral) and average them."""
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f1_scores = [
            sklearn.metrics.f1_score(y_true=golds == label, y_pred=preds == label)
            for label in (0, 1, 2)
        ]
        return mean(f1_scores)
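    # A minimal sketch of `cb_multi_f1` on hypothetical (pred, gold) pairs:
    #   cb_multi_f1([(0, 0), (1, 1), (2, 2)])  # -> 1.0 (every label perfect)
    # Each of the three labels is scored one-vs-rest, and the three binary F1
    # scores are averaged (macro F1).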

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }


class Copa(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1.0 if pred == gold else 0.0

        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

    @staticmethod
    def convert_choice(choice):
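        # Lowercase the first character so the choice can be spliced into the
        # premise after a connective such as "because" or "so".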
        return choice[0].lower() + choice[1:]


class MultiRC(PromptSourceTask):
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def process_results(self, doc, results):
        ll_true_choice, ll_false_choice = results
        pred = ll_true_choice > ll_false_choice
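        # Accuracy is deferred to `acc_all`, which needs the full doc so it
        # can group answers by question (a question counts as correct only if
        # every one of its answer options is classified correctly).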
        return {"acc": (pred, doc)}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": acc_all}


class ReCoRD(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # In ReCoRD, a single doc expands into multiple few-shot "examples":
        # each doc carries several answer candidates, and each candidate is
        # scored yes/no.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        # See: training_docs
        yield from self.dataset["validation"]

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples

        # TODO (jon-tow): Look at result
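        # Each entry in `results` is assumed to be a (loglikelihood, is_greedy)
        # tuple, one per candidate entity; pick the highest-likelihood one.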
        max_idx = np.argmax(np.array([result[0] for result in results]))

        prediction = doc["entities"][max_idx]
        gold_label_set = doc["answers"]
        f1 = metric_max_over_ground_truths(
            squad_metrics.compute_f1, prediction, gold_label_set
        )
        em = metric_max_over_ground_truths(
            squad_metrics.compute_exact, prediction, gold_label_set
        )
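        # Hypothetical example: a prediction of "Bob Dylan" scored against the
        # gold answers ["Dylan", "Bob Dylan"] takes the max over the answer
        # set, giving em = 1.0 and token-level f1 = 1.0.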

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }


class WordsInContext(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}


class SGWinogradSchemaChallenge(PromptSourceTask):
    VERSION = 0
    # Note: This implementation differs from Fig G.32 of the GPT-3 paper
    #       because this is the SuperGLUE (binary) version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc.fixed"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # The GPT-3 paper's few-shot format uses only positive examples for "training".
                self._training_docs = [
                    doc for doc in self.dataset["train"] if doc["label"]
                ]
            return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}


class WinogenderSchemaDiagnostics(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "axg"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def test_docs(self):
        return self.dataset["test"]


class BroadcoverageDiagnostics(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "axb"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def test_docs(self):
        return self.dataset["test"]