superglue.py 13.4 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
"""
To-do:
    - WSC requires free-form generation
    - ReCoRD
"""
Jason Phang's avatar
Jason Phang committed
6
import numpy as np
Jason Phang's avatar
Jason Phang committed
7
8
9
10
from . common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
Leo Gao's avatar
Fix  
Leo Gao committed
11
from ..utils import general_detokenize
Jason Phang's avatar
Jason Phang committed
12

Jason Phang's avatar
Jason Phang committed
13

14
class BoolQ(HFTask):
    """SuperGLUE BoolQ: yes/no question answering over a short passage.

    Scored by comparing the log-likelihood of the continuations " yes"
    and " no" after the passage/question prompt.
    """
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Read the following passages and answer each question with a yes or a no."

    def doc_to_text(self, doc):
        # Passage, then the question; the model continues after "Answer:".
        return "{}\nQuestion: {}\nAnswer:".format(doc["passage"], doc["question"])

    def doc_to_target(self, doc):
        # Gold label rendered as " yes"/" no" (leading space joins the prompt).
        return " " + yesno(doc["label"])

    def construct_requests(self, doc, ctx):
        # One likelihood request per candidate answer.
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        # Correct iff preferring "yes" agrees with the gold label.
        acc = 1. if (ll_yes > ll_no) == gold else 0.
        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
63

Jason Phang's avatar
Jason Phang committed
64

65
class CommitmentBank(HFTask):
    """SuperGLUE CommitmentBank (CB): 3-way entailment classification.

    HF `super_glue/cb` label order is 0=entailment, 1=contradiction,
    2=neutral; these are verbalized as "True", "False" and "Neither".
    """
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        # Fixed: the two fragments previously joined as "committedto".
        return "Given a premise and a hypothesis, classify whether the author of the premise is committed " \
            "to the truth of the hypothesis. The three possible labels are true, false or neither."

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        # True = entailment (label 0)
        # False = contradiction (label 1)
        # Neither = neutral (label 2)
        # Fixed: "Neither" and "False" were swapped relative to the HF label
        # order, mis-scoring every contradiction/neutral example.
        return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        # Request order must mirror label order (0, 1, 2) so the argmax in
        # process_results maps directly onto gold labels.
        ll_true, _ = rf.loglikelihood(ctx, ' True')
        ll_false, _ = rf.loglikelihood(ctx, ' False')
        ll_neither, _ = rf.loglikelihood(ctx, ' Neither')

        return ll_true, ll_false, ll_neither

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        # F1 aggregation needs the full set of (pred, gold) pairs, so we
        # defer the computation to cb_multi_fi via aggregation().
        return {
            "acc": acc,
            "f1": (pred, gold)
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    @classmethod
    def cb_multi_fi(cls, items):
        # Macro-average of one-vs-rest F1 over the three classes.
        # NOTE(review): relies on sklearn.metrics being reachable through the
        # bare "import sklearn" at the top of the file — confirm.
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
        f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
        f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
        avg_f1 = mean([f11, f12, f13])
        return avg_f1

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }
Jason Phang's avatar
Jason Phang committed
134

Jason Phang's avatar
Jason Phang committed
135

136
class Copa(HFTask):
    """SuperGLUE COPA: pick the more plausible cause/effect of a premise.

    The premise is joined to each alternative with "because" (cause) or
    "therefore" (effect); the alternative with the higher log-likelihood wins.
    """
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        # Fixed: the two fragments previously joined as "without,choose".
        return "Given a premise and one alternative with a causal relation to the premise and another without, " \
            "choose the more plausible alternative"

    def doc_to_text(self, doc):
        # Drop the premise's trailing period and append the causal connective.
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        return doc["premise"].strip()[:-1] + f" {connector}"

    def doc_to_target(self, doc):
        correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
        # Connect the sentences: lowercase the alternative's first letter.
        return " " + self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        choice1 = " " + self.convert_choice(doc["choice1"])
        choice2 = " " + self.convert_choice(doc["choice2"])

        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
        ll_choice2, _ = rf.loglikelihood(ctx, choice2)

        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }

    @staticmethod
    def convert_choice(choice):
        # Lowercase the first character so the choice reads as a mid-sentence clause.
        return choice[0].lower() + choice[1:]


200
class MultiRC(HFTask):
    """SuperGLUE MultiRC: multi-sentence reading comprehension.

    Each (question, candidate answer) pair is scored by comparing the
    likelihood of the candidate prefixed with "yes," versus "no,".
    """
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "READING COMPREHENSION ANSWER KEY"

    def doc_to_text(self, doc):
        return "{}\nQuestion: {}\nAnswer:".format(doc["paragraph"], doc["question"])

    def doc_to_target(self, doc):
        return " " + self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        # Render a candidate with its yes/no verdict, e.g. "yes, Paris".
        verdict = "yes" if label else "no"
        return "{}, {}".format(verdict, answer)

    def construct_requests(self, doc, ctx):
        # Score the same candidate answer under both verdicts.
        affirmative = self.format_answer(answer=doc["answer"], label=True)
        negative = self.format_answer(answer=doc["answer"], label=False)

        ll_true_choice, _ = rf.loglikelihood(ctx, " " + affirmative)
        ll_false_choice, _ = rf.loglikelihood(ctx, " " + negative)

        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        # acc_all needs the raw (prediction, doc) pairs so it can group
        # candidate answers by question.
        pred = np.argmax(results)
        return {"acc": (pred, doc)}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": acc_all}

Jason Phang's avatar
Jason Phang committed
253
254
255
256
257
258
259
260
261
262
263
264

class ReCoRD(HFTask):
    """SuperGLUE ReCoRD: cloze-style QA over news passages.

    Every candidate entity is substituted into the "@placeholder" slot of the
    query and scored by log-likelihood; the top-scoring entity is compared
    against the gold answers with EM and token F1, averaged per example.
    """
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    @classmethod
    def _process_doc(cls, doc):
        # Collapse a raw HF example to the fields evaluation needs; sort the
        # de-duplicated lists so request ordering is deterministic.
        return {
            "passage": doc["passage"],
            "query": doc["query"],
            "entities": sorted(set(doc["entities"])),
            "answers": sorted(set(doc["answers"])),
        }

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of
        # few shot example packing: multiple answer candidates per query.
        # Fixed: the previous version emitted one doc per candidate entity —
        # including WRONG candidates, despite intending only correct answers
        # for context packing — and the resulting keys ("entity"/"label") did
        # not match what construct_requests/process_results read.
        if self._training_docs is None:
            self._training_docs = [
                self._process_doc(doc) for doc in self.data["train"]
            ]
        return self._training_docs

    def validation_docs(self):
        for doc in self.data["validation"]:
            yield self._process_doc(doc)

    def doc_to_text(self, doc):
        # The passage stores "@highlight"-separated bullet points after the
        # initial text; render the highlights as a bulleted list.
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        text = initial_text + "\n\n"
        for highlight in highlights:
            text += f"  - {highlight}.\n"
        return text

    @classmethod
    def format_answer(cls, query, entity):
        return f'  - {query}'.replace("@placeholder", entity)

    def doc_to_target(self, doc):
        # For fewshot packing, output only the first correct answer.
        return self.format_answer(query=doc["query"], entity=doc["answers"][0])

    def construct_requests(self, doc, ctx):
        # Fixed: previously iterated doc["entity"] — a single string, i.e.
        # its characters. Score every candidate entity instead.
        requests = [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entities"]
        ]
        return requests

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        # Each result is a (loglikelihood, is_greedy) pair; rank on the former.
        max_idx = np.argmax(np.array([result[0] for result in results]))

        prediction = doc["entities"][max_idx]
        gold_label_set = doc["answers"]
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }


350
class WordsInContext(HFTask):
    """SuperGLUE WiC: is a word used with the same sense in two sentences?"""
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def doc_to_text(self, doc):
        # The target word is recovered from sentence1 via its character span.
        word = doc["sentence1"][doc["start1"]:doc["end1"]]
        return (
            "Sentence 1: {}\n"
            "Sentence 2: {}\n"
            "Question: Is the word '{}' used in the same way in the"
            " two sentences above?\nAnswer:"
        ).format(doc["sentence1"], doc["sentence2"], word)

    def doc_to_target(self, doc):
        # 0 -> " no", 1 -> " yes"
        return " " + {0: "no", 1: "yes"}[doc["label"]]

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        # Correct iff preferring "yes" agrees with the gold label.
        acc = 1. if (ll_yes > ll_no) == gold else 0.
        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
403
404


405
class SGWinogradSchemaChallenge(HFTask):
    """SuperGLUE WSC: does a marked pronoun refer to a given noun (yes/no)?

    Note: This implementation differs from Fig G.32 because this is the
    SuperGLUE, binary version of the task.
    """
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if not self.has_training_docs():
            return None
        if self._training_docs is None:
            # GPT-3 Paper's format only uses positive examples for fewshot "training"
            self._training_docs = [
                doc for doc in self.data["train"] if doc["label"]
            ]
        return self._training_docs

    def fewshot_description(self):
        return (
            "Final Exam with Answer Key\n"
            "Instructions: Please carefully read the following passages. "
            "For each passage, you must identify which noun the pronoun marked in *bold*"
            " refers to.\n====="
        )

    def doc_to_text(self, doc):
        raw = doc["text"]
        pronoun = doc["span2_text"]
        noun = doc["span1_text"]
        # NOTE: HuggingFace span indices are word-based not character-based,
        # so the prefix is rebuilt by joining the first span2_index words.
        prefix = " ".join(raw.split()[:doc["span2_index"]])
        # Skip the prefix, the separating space, and the pronoun itself.
        suffix = raw[len(prefix) + len(pronoun) + 1:]
        passage = general_detokenize(prefix + f" *{pronoun}*" + suffix)
        return (
            f"Passage: {passage}\n"
            f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
            "Answer:"
        )

    def doc_to_target(self, doc):
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        # Correct iff preferring "yes" matches the binary gold label.
        acc = 1. if (ll_yes > ll_no) == gold else 0.
        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}