"""
SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems
https://w4ngatang.github.io/static/papers/superglue.pdf

SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.

Homepage: https://super.gluebenchmark.com/

TODO: WSC requires free-form generation.
"""
import numpy as np
13
14
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
Jonathan Tow's avatar
Jonathan Tow committed
15
16
17
from lm_eval.base import rf, Task
from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno
from lm_eval.utils import general_detokenize
Jason Phang's avatar
Jason Phang committed
18

Jason Phang's avatar
Jason Phang committed
19

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
_CITATION = """
@inproceedings{NEURIPS2019_4496bf24,
    author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
    booktitle = {Advances in Neural Information Processing Systems},
    editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
    pages = {},
    publisher = {Curran Associates, Inc.},
    title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
    url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
    volume = {32},
    year = {2019}
}
"""


Jonathan Tow's avatar
Jonathan Tow committed
35
class BoolQ(Task):
    """SuperGLUE BoolQ: answer a yes/no question about a given passage.

    Scored by comparing the loglikelihoods of the " yes" and " no"
    continuations of the prompt.
    """

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split once; later calls reuse the list.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        passage, question = doc["passage"], doc["question"]
        return f"{passage}\nQuestion: {question}?\nAnswer:"

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        # Only the passage is used for contamination checks.
        return doc["passage"]

    def doc_to_target(self, doc):
        return " " + yesno(doc["label"])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate answer string.
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        # Predict "yes" iff it is the likelier continuation; doc["label"]
        # is 1 for yes and 0 for no, so a bool/int comparison suffices.
        prediction = ll_yes > ll_no
        return {"acc": float(prediction == doc["label"])}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
89

Jason Phang's avatar
Jason Phang committed
90

Jonathan Tow's avatar
Jonathan Tow committed
91
class CommitmentBank(Task):
    """SuperGLUE CommitmentBank (CB): 3-way NLI over premise/hypothesis pairs.

    Labels: 0 = entailment ("True"), 1 = contradiction ("False"),
    2 = neutral ("Neither"). Reports accuracy and macro-averaged F1
    over the three classes.
    """

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split once.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per label verbalization.
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        return ll_true, ll_false, ll_neither

    def process_results(self, doc, results):
        gold = doc["label"]
        # Results arrive in label order (True, False, Neither), so the argmax
        # index is directly the predicted label id.
        pred = np.argmax(results)
        acc = 1.0 if pred == gold else 0.0
        # F1 needs the raw (pred, gold) pairs; they are folded corpus-wide
        # by the aggregation function below.
        return {"acc": acc, "f1": (pred, gold)}

    def higher_is_better(self):
        return {"acc": True, "f1": True}

    @classmethod
    def cb_multi_f1(cls, items):
        """Macro-average the one-vs-rest F1 scores of the three CB classes."""
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
        f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
        f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
        return mean([f11, f12, f13])

    # Backward-compatible alias for the historical, typo'd name.
    cb_multi_fi = cb_multi_f1

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_f1,
        }
Jason Phang's avatar
Jason Phang committed
158

Jason Phang's avatar
Jason Phang committed
159

Jonathan Tow's avatar
Jonathan Tow committed
160
class Copa(Task):
    """SuperGLUE COPA: pick the more plausible cause/effect continuation
    of a premise, scored as a two-way loglikelihood comparison."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split once.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # Map the question type to the word joining premise and choice,
        # and drop the premise's trailing period.
        connector = {"cause": "because", "effect": "therefore"}[doc["question"]]
        premise = doc["premise"].strip()[:-1]
        return f"{premise} {connector}"

    def doc_to_target(self, doc):
        # label 0 -> choice1, label 1 -> choice2.
        if doc["label"] == 0:
            correct_choice = doc["choice1"]
        else:
            correct_choice = doc["choice2"]
        # Connect the sentences.
        return " " + self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        first = " " + self.convert_choice(doc["choice1"])
        second = " " + self.convert_choice(doc["choice2"])
        ll_choice1, _ = rf.loglikelihood(ctx, first)
        ll_choice2, _ = rf.loglikelihood(ctx, second)
        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        # The index of the likelier continuation is the predicted label.
        pred = np.argmax(results)
        return {"acc": 1.0 if pred == doc["label"] else 0.0}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

    @staticmethod
    def convert_choice(choice):
        # De-capitalize so the choice reads as a sentence continuation.
        return choice[0].lower() + choice[1:]


Jonathan Tow's avatar
Jonathan Tow committed
222
class MultiRC(Task):
    """SuperGLUE MultiRC: judge whether a candidate answer to a question
    about a paragraph is correct, as a per-candidate yes/no decision."""

    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split once.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        paragraph, question = doc["paragraph"], doc["question"]
        return f"{paragraph}\nQuestion: {question}\nAnswer:"

    def doc_to_target(self, doc):
        return " " + self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        # Render the candidate answer followed by a yes/no verdict line.
        label_str = "yes" if label else "no"
        return f"{answer}\nIs the answer correct? {label_str}"

    def construct_requests(self, doc, ctx):
        # Score the same candidate answer rendered with both verdicts.
        true_choice = self.format_answer(answer=doc["answer"], label=True)
        false_choice = self.format_answer(answer=doc["answer"], label=False)
        ll_true_choice, _ = rf.loglikelihood(ctx, f" {true_choice}")
        ll_false_choice, _ = rf.loglikelihood(ctx, f" {false_choice}")
        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        ll_true_choice, ll_false_choice = results
        pred = ll_true_choice > ll_false_choice
        # The acc_all aggregator consumes (prediction, doc) pairs, so the
        # whole doc is passed through rather than just the prediction.
        return {"acc": (pred, doc)}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": acc_all}
Jason Phang's avatar
multirc  
Jason Phang committed
274

Jason Phang's avatar
Jason Phang committed
275

Jonathan Tow's avatar
Jonathan Tow committed
276
class ReCoRD(Task):
    """SuperGLUE ReCoRD: cloze-style reading comprehension. Pick the entity
    that fills the @placeholder in the query; scored with EM and token F1."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of
        # few-shot example packing: a doc carries several answer candidates,
        # each of which is scored yes/no. Pre-process and cache the split.
        if self._training_docs is None:
            self._training_docs = [
                self._process_doc(doc) for doc in self.dataset["train"]
            ]
        return self._training_docs

    def validation_docs(self):
        # See training_docs; processed lazily here.
        for doc in self.dataset["validation"]:
            yield self._process_doc(doc)

    @classmethod
    def _process_doc(cls, doc):
        # Deduplicate and sort entities/answers for deterministic ordering.
        return {
            "passage": doc["passage"],
            "query": doc["query"],
            "entities": sorted(list(set(doc["entities"]))),
            "answers": sorted(list(set(doc["answers"]))),
        }

    def doc_to_text(self, doc):
        # The passage is an initial text plus "@highlight"-separated bullets.
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        bullets = "".join(f"  - {highlight}.\n" for highlight in highlights)
        return initial_text + "\n\n" + bullets

    @classmethod
    def format_answer(cls, query, entity):
        # Substitute the candidate entity into the cloze query.
        return f"  - {query}".replace("@placeholder", entity)

    def doc_to_target(self, doc):
        # We only output the first correct entity in a doc.
        return self.format_answer(query=doc["query"], entity=doc["answers"][0])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate entity.
        return [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entities"]
        ]

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        #   - pick the maximum-likelihood candidate entity,
        #   - evaluate accuracy (EM) and token F1 PER EXAMPLE,
        #   - average over all examples (see aggregation).
        lls = np.array([result[0] for result in results])
        prediction = doc["entities"][np.argmax(lls)]
        gold_label_set = doc["answers"]

        f1 = metric_max_over_ground_truths(
            squad_metrics.compute_f1, prediction, gold_label_set
        )
        em = metric_max_over_ground_truths(
            squad_metrics.compute_exact, prediction, gold_label_set
        )

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }


Jonathan Tow's avatar
Jonathan Tow committed
369
class WordsInContext(Task):
    """SuperGLUE WiC: decide whether a target word is used with the same
    sense in two sentences, as a yes/no loglikelihood comparison."""

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Materialize and cache the train split once.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # The target word is recovered from sentence1 via character offsets.
        word = doc["sentence1"][doc["start1"] : doc["end1"]]
        return (
            "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
            " two sentences above?\nAnswer:".format(
                doc["sentence1"],
                doc["sentence2"],
                word,
            )
        )

    def doc_to_target(self, doc):
        # label 0 -> "no", label 1 -> "yes".
        return " {}".format({0: "no", 1: "yes"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        # Predict "yes" iff it is the likelier continuation; label is 0/1.
        prediction = ll_yes > ll_no
        return {"acc": float(prediction == doc["label"])}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
423
424


Jonathan Tow's avatar
Jonathan Tow committed
425
class SGWinogradSchemaChallenge(Task):
    """SuperGLUE WSC: does the marked pronoun refer to the marked noun?

    Posed as a binary yes/no question and scored by comparing the
    loglikelihoods of the " yes" and " no" continuations.
    """

    VERSION = 0
    # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
    #       binary version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # GPT-3 Paper's format only uses positive examples for fewshot "training"
                self._training_docs = [
                    doc for doc in self.dataset["train"] if doc["label"]
                ]
            return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        """Render the passage with the pronoun span emphasized as *pronoun*,
        followed by the coreference question."""
        raw_passage = doc["text"]
        # NOTE: HuggingFace span indices are word-based not character-based.
        # `pre` is the text before the pronoun, rebuilt by joining the first
        # span2_index whitespace-split tokens with single spaces.
        pre = " ".join(raw_passage.split()[: doc["span2_index"]])
        # Tail of the passage after the pronoun; the +1 skips one separator
        # character. NOTE(review): this assumes single-space separation in the
        # raw text (so len(pre) matches the raw prefix) — confirm on the data.
        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
        passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        text = (
            f"Passage: {passage}\n"
            + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
            + "Answer:"
        )
        return text

    def doc_to_target(self, doc):
        return " " + yesno(doc["label"])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate answer string.
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        # Predicted "yes" iff the yes-continuation is likelier; gold is 0/1.
        acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}