"scripts/wan/run_wan_t2v_distill_4step_cfg.sh" did not exist on "aefaf565be061f02237e08c18c3ffdef67af3ac1"
superglue.py 14.1 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
"""
SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems
https://w4ngatang.github.io/static/papers/superglue.pdf

SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.

Homepage: https://super.gluebenchmark.com/

TODO: WSC requires free-form generation.
"""
import numpy as np
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.base import rf, Task
from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno
from lm_eval.utils import general_detokenize


_CITATION = """
@inproceedings{NEURIPS2019_4496bf24,
    author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
    booktitle = {Advances in Neural Information Processing Systems},
    editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
    pages = {},
    publisher = {Curran Associates, Inc.},
    title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
    url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
    volume = {32},
    year = {2019}
}
"""


class BoolQ(Task):
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"

    def doc_to_target(self, doc):
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):

        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }
    
    def higher_is_better(self):
        return {
            "acc": True
        }
    
    def aggregation(self):
        return {
            "acc": mean
        }
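
# A minimal illustration of the rendered BoolQ prompt (the doc below is
# invented; the fields follow the HF "super_glue/boolq" schema):
#
#     doc = {"passage": "Mongooses have some resistance to snake venom.",
#            "question": "are mongooses resistant to snake venom",
#            "label": 1}
#     doc_to_text(doc)
#     # -> "Mongooses have some resistance to snake venom.\n"
#     #    "Question: are mongooses resistant to snake venom?\nAnswer:"
#     doc_to_target(doc)  # -> " yes"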


class CommitmentBank(Task):
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, ' True')
        ll_false, _ = rf.loglikelihood(ctx, ' False')
        ll_neither, _ = rf.loglikelihood(ctx, ' Neither')

        return ll_true, ll_false, ll_neither

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc,
            "f1": (pred, gold)
        }
    
    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    @classmethod
    def cb_multi_f1(cls, items):
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
        f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
        f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
        avg_f1 = mean([f11, f12, f13])
        return avg_f1
    
    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }
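
# Worked example for cb_multi_f1 (toy values, not dataset outputs): for
# (pred, gold) pairs [(0, 0), (1, 2), (2, 2)], the per-class binary F1 scores
# are 1.0 for class 0 (True), 0.0 for class 1 (False; no true positives), and
# 2/3 for class 2 (Neither), so the macro F1 is mean([1.0, 0.0, 2/3]) ≈ 0.556.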


class Copa(Task):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        # Drop the period
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        return doc["premise"].strip()[:-1] + f" {connector}"

    def doc_to_target(self, doc):
        correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
        # Connect the sentences
        return " " + self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        choice1 = " " + self.convert_choice(doc["choice1"])
        choice2 = " " + self.convert_choice(doc["choice2"])
        
        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
        ll_choice2, _ = rf.loglikelihood(ctx, choice2)

        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc
        }
    
    def higher_is_better(self):
        return {
            "acc": True
        }
    
    def aggregation(self):
        return {
            "acc": mean
        }

    @staticmethod
    def convert_choice(choice):
        return choice[0].lower() + choice[1:]
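
# Example of how a COPA prompt and continuation are assembled (invented doc):
#
#     doc = {"premise": "The man broke his toe.", "question": "cause",
#            "choice1": "He got a hole in his sock.",
#            "choice2": "He dropped a hammer on his foot.", "label": 1}
#     doc_to_text(doc)    # -> "The man broke his toe because"
#     doc_to_target(doc)  # -> " he dropped a hammer on his foot."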


class MultiRC(Task):
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
        return " " + self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        label_str = "yes" if label else "no"
        return f"{answer}\nIs the answer correct? {label_str}"

    def construct_requests(self, doc, ctx):
        true_choice = self.format_answer(answer=doc["answer"], label=True)
        false_choice = self.format_answer(answer=doc["answer"], label=False)
        
        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')

        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        ll_true_choice, ll_false_choice = results
        pred = ll_true_choice > ll_false_choice
        return {
            "acc": (pred, doc)
        }
    
    def higher_is_better(self):
        return {
            "acc": True
        }
    
    def aggregation(self):
        return {
            "acc": acc_all
        }
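
# Example of the two continuations MultiRC scores per answer candidate
# (answer text invented): format_answer(answer="the moon", label=True) yields
# "the moon\nIs the answer correct? yes", and label=False ends in "no"; the
# prediction is whichever continuation the model finds more likely.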


class ReCoRD(Task):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of few-shot example packing.
        # Each doc consists of multiple answer candidates, each of which is scored yes/no.
        if self._training_docs is None:
            self._training_docs = []
            for doc in self.dataset["train"]:
                self._training_docs.append(self._process_doc(doc))
        return self._training_docs

    def validation_docs(self):
        # See: training_docs
        for doc in self.dataset["validation"]:
            yield self._process_doc(doc)

    @classmethod
    def _process_doc(cls, doc):
        return {
            "passage": doc["passage"],
            "query": doc["query"],
            "entities": sorted(list(set(doc["entities"]))),
            "answers": sorted(list(set(doc["answers"]))),
        }

    def doc_to_text(self, doc):
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        text = initial_text + "\n\n"
        for highlight in highlights:
            text += f"  - {highlight}.\n"
        return text

    @classmethod
    def format_answer(cls, query, entity):
        return f'  - {query}'.replace("@placeholder", entity)

    def doc_to_target(self, doc):
        # We only output the first correct entity in a doc
        return self.format_answer(query=doc["query"], entity=doc["answers"][0])

    def construct_requests(self, doc, ctx):
        requests = [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entities"]
        ]
        return requests

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        max_idx = np.argmax(np.array([result[0] for result in results]))

        prediction = doc["entities"][max_idx]
        gold_label_set = doc["answers"]
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }
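
# Sketch of a rendered ReCoRD passage (contents invented): a passage of
# "Text\n@highlight\nFirst point\n@highlight\nSecond point" renders as
#
#     Text
#
#       - First point.
#       - Second point.
#
# and format_answer(query="Cats @placeholder dogs", entity="chase") returns
# "  - Cats chase dogs"; one loglikelihood request is issued per entity.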


class WordsInContext(Task):
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
               " two sentences above?\nAnswer:".format(
                    doc["sentence1"],
                    doc["sentence2"],
                    doc["sentence1"][doc["start1"]:doc["end1"]],
                )

    def doc_to_target(self, doc):
        return " {}".format({0: "no", 1: "yes"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
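
# Note on the target word above: it is recovered by character-offset slicing,
# doc["sentence1"][doc["start1"]:doc["end1"]]. For sentence1 "Room and
# board." with start1=9, end1=14 (offsets invented), the slice is "board".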


class SGWinogradSchemaChallenge(Task):
    VERSION = 0
    # Note: This implementation differs from Fig G.32 because this is the
    #       binary (SuperGLUE) version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # The GPT-3 paper's format only uses positive examples for few-shot "training".
                self._training_docs = [
                    doc for doc in
                    self.dataset["train"]
                    if doc["label"]
                ]
            return self._training_docs

    def validation_docs(self):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
        raw_passage = doc["text"]
        # NOTE: HuggingFace span indices are word-based, not character-based.
        pre = " ".join(raw_passage.split()[:doc["span2_index"]])
        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
        passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        text = (
            f"Passage: {passage}\n"
            + f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
            + "Answer:"
        )
        return text

    def doc_to_target(self, doc):
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):

        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }
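
# Example of the WSC passage markup (invented doc): with text
# "Mark told Pete many lies about himself.", span2_index=6 (word-based) and
# span2_text "himself", doc_to_text renders
# 'Passage: Mark told Pete many lies about *himself*.' followed by
# 'Question: In the passage above, does the pronoun "*himself*" refer to
# "*<span1_text>*"?' and "Answer:"; the target is " yes" or " no".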