"""
To-do:
    - WSC requires free-form generation
    - ReCoRD
"""
import numpy as np
from .common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
import sklearn.metrics
import transformers.data.metrics.squad_metrics as squad_metrics


class BoolQ(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Read the following passages and answer each question with a yes or a no."

    def doc_to_text(self, doc):
        return f"{doc['passage']}\nquestion: {doc['question']}\nanswer:"

    def doc_to_target(self, doc):
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):
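        # Score " yes" vs. " no" as continuations of the prompt; the higher loglikelihood wins.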
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }
    
    def higher_is_better(self):
        return {
            "acc": True
        }
    
    def aggregation(self):
        return {
            "acc": mean
        }


class CommitmentBank(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Given a premise and a hypothesis, classify whether the author of the premise is committed " \
            "to the truth of the hypothesis. The three possible labels are true, false or neither."

    def doc_to_text(self, doc):
        return "{}\nquestion: {} true, false or neither?\nanswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "true", 1: "false", 2: "neither"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        # The order of these scores must match the label indices:
        # 0 = entailment ("true"), 1 = contradiction ("false"), 2 = neutral ("neither").
        ll_true, _ = rf.loglikelihood(ctx, ' true')
        ll_false, _ = rf.loglikelihood(ctx, ' false')
        ll_neither, _ = rf.loglikelihood(ctx, ' neither')

        return ll_true, ll_false, ll_neither

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.
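        # F1 is aggregated corpus-wide: emit the (pred, gold) pair here and let cb_multi_f1
        # (see aggregation below) compute the macro-F1 over all examples.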

        return {
            "acc": acc,
            "f1": (pred, gold)
        }
    
    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    @classmethod
    def cb_multi_f1(cls, items):
        # Macro-averaged F1 over the three classes, computed from the (pred, gold)
        # pairs collected in process_results.
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f1_true = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
        f1_false = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
        f1_neither = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
        return mean([f1_true, f1_false, f1_neither])
    
    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_f1,
        }


class Copa(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Given a premise and one alternative with a causal relation to the premise and another without," \
            " choose the more plausible alternative."

    def doc_to_text(self, doc):
        # Drop the period
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        return doc["premise"].strip()[:-1] + f" {connector}"

    def doc_to_target(self, doc):
        correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
        # Connect the sentences
        return " " + self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        choice1 = " " + self.convert_choice(doc["choice1"])
        choice2 = " " + self.convert_choice(doc["choice2"])
        
        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
        ll_choice2, _ = rf.loglikelihood(ctx, choice2)

        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc
        }
    
    def higher_is_better(self):
        return {
            "acc": True
        }
    
    def aggregation(self):
        return {
            "acc": mean
        }

    @staticmethod
    def convert_choice(choice):
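        # Lower-case the first character so the choice reads as a continuation of
        # "... because" / "... therefore".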
        return choice[0].lower() + choice[1:]


class MultiRC(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "READING COMPREHENSION ANSWER KEY"

    def doc_to_text(self, doc):
        return f"{doc['paragraph']}\n\n{doc['question']}\n"

    def doc_to_target(self, doc):
        return self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        label_str = "True" if label else "False"
        return f"[{label_str}] {answer}"

    def construct_requests(self, doc, ctx):
        true_choice = self.format_answer(answer=doc["answer"], label=True)
        false_choice = self.format_answer(answer=doc["answer"], label=False)
        
        ll_true_choice, _ = rf.loglikelihood(ctx, true_choice)
        ll_false_choice, _ = rf.loglikelihood(ctx, false_choice)

        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        pred = np.argmax(results)
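        # acc_all (from lm_eval.base) aggregates these (pred, doc) pairs; it is assumed to group
        # answers by question and count a question as correct only if every answer is classified correctly.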
        return {
            "acc": (pred, doc)
        }
    
    def higher_is_better(self):
        return {
            "acc": True
        }
    
    def aggregation(self):
        return {
            "acc": acc_all
        }


class ReCoRD(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of few-shot example packing.
        # Each doc consists of multiple answer candidates, each of which is scored yes/no.
        # Hence, we create one "doc" for each (passage + query, answer) pair.
        # Moreover, we only use the correct answers for context packing
        # (This is not an issue for evaluation, where we can directly score multiple candidates at once).
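        # For example (hypothetical values): a passage whose gold answers include "Paris" contributes
        # the few-shot doc {"passage": ..., "query": ..., "entity": "Paris", "label": True}.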
        if self._training_docs is None:
            self._training_docs = []
            for doc in self.data["train"]:
                # Only the correct answers are used for context packing (see comment above).
                for entity in sorted(set(doc["answers"])):
                    self._training_docs.append({
                        "passage": doc["passage"],
                        "query": doc["query"],
                        "entity": entity,
                        "label": True,
                    })
        return self._training_docs

    def validation_docs(self):
        # For evaluation, keep the full candidate set so that every entity can be scored at once
        # and process_results can compare the top-scoring candidate against the gold answers.
        for doc in self.data["validation"]:
            yield {
                "passage": doc["passage"],
                "query": doc["query"],
                "entities": sorted(set(doc["entities"])),
                "answers": sorted(set(doc["answers"])),
            }

    def doc_to_text(self, doc):
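        # ReCoRD passages embed bullet highlights after "@highlight" markers; split them out and
        # render them as an indented list beneath the initial text.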
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        text = initial_text + "\n\n"
        for highlight in highlights:
            text += f"  - {highlight}.\n"
        return text

    @classmethod
    def format_answer(cls, query, entity):
        return f'  - {query}'.replace("@placeholder", entity)

    def doc_to_target(self, doc):
        return self.format_answer(query=doc["query"], entity=doc["entity"])

    def construct_requests(self, doc, ctx):
        requests = [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entities"]
        ]
        return requests

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
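        # "Accuracy" here is exact match (EM) against any gold answer string, and token-level F1 is
        # likewise taken as the maximum over the gold answer set.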
        # Each result is a (loglikelihood, is_greedy) pair, so rank candidates by loglikelihood only.
        max_idx = np.argmax(np.array([result[0] for result in results]))

        prediction = doc["entities"][max_idx]
        gold_label_set = list(set(doc["answers"]))
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }


class WordsInContext(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def doc_to_text(self, doc):
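        # The target word's surface form is recovered by slicing sentence1 with the start1/end1
        # character offsets provided by the dataset.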
        return "{}\n{}\nQuestion: Is the word '{}' used in the same way in the" \
               " two sentences above?\nanswer:".format(
                    doc["sentence1"],
                    doc["sentence2"],
                    doc["sentence1"][doc["start1"]:doc["end1"]],
                )

    def doc_to_target(self, doc):
        return " {}".format({0: "no", 1: "yes"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }


class SGWinogradSchemaChallenge(HFTask):
    # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
    #       binary version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # GPT-3 Paper's format only uses positive examples for fewshot "training"
                self._training_docs = [
                    doc for doc in
                    self.data["train"]
                    if doc["label"]
                ]
            return self._training_docs

    def fewshot_description(self):
        return "Final Exam with Answer Key\n" \
           "Instructions: Please carefully read the following passages. " \
           "For each passage, you must identify which noun the pronoun marked in *bold*" \
           " refers to.\n====="

    def doc_to_text(self, doc):
        raw_passage = doc["text"]
        # NOTE: span2_index in the HuggingFace dataset is a word index, not a character offset.
        pre = " ".join(raw_passage.split()[:doc["span2_index"]])
        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
        passage = pre + " *{}*".format(doc["span2_text"]) + post
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        text = (
            f"Passage: {passage}\n"
            + f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
            + "Answer:"
        )
        return text

    def doc_to_target(self, doc):
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):

        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }