"""
To-do:
    - WSC requires free-form generation
    - ReCoRD
"""
import numpy as np
import sklearn
import sklearn.metrics
import transformers.data.metrics.squad_metrics as squad_metrics

from lm_eval.base import rf

from .common import HFTask, yesno
from ..metrics import mean, acc_all, metric_max_over_ground_truths
from ..utils import general_detokenize


class BoolQ(HFTask):
    """SuperGLUE BoolQ: answer a yes/no question about a passage.

    The task is scored by comparing the log-likelihood of the
    continuations " yes" and " no" after the rendered prompt.
    """

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Read the following passages and answer each question with a yes or a no."

    def doc_to_text(self, doc):
        """Render the prompt: the passage, the question, then "Answer:"."""
        passage = doc["passage"]
        question = doc["question"]
        return f"{passage}\nQuestion: {question}\nAnswer:"

    def doc_to_target(self, doc):
        """Return the gold continuation: a space followed by `yesno(label)`."""
        answer = yesno(doc["label"])
        return " " + answer

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of " yes" and " no" given `ctx`.

        Returns a 2-tuple ordered (yes, no); only the first element of each
        request's result pair is kept.
        """
        scores = []
        for continuation in (" yes", " no"):
            ll, _ = rf.loglikelihood(ctx, continuation)
            scores.append(ll)
        return tuple(scores)

    def process_results(self, doc, results):
        """Score one doc: 1.0 when "yes beats no" agrees with the label."""
        yes_score, no_score = results
        predicted_yes = yes_score > no_score
        # The label is compared against a bool, so it is expected to be 0/1.
        is_correct = predicted_yes == doc["label"]
        return {"acc": 1. if is_correct else 0.}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}


class CommitmentBank(HFTask):
    """SuperGLUE CommitmentBank (CB): True / False / Neither classification.

    Label ids follow the Hugging Face `super_glue`/`cb` label names
    (0 = entailment, 1 = contradiction, 2 = neutral), rendered here as
    "True", "False" and "Neither" respectively. The order of the requests in
    `construct_requests` matches the label ids so that `np.argmax` over the
    results can be compared directly with `doc["label"]`.
    """

    # Bumped from 0: the answer mapping / request order for labels 1 and 2
    # was swapped (contradiction was rendered as "Neither"), which changes
    # scores. NOTE(review): confirm this matches the project's versioning policy.
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    # Answer strings indexed by label id.
    # True = entailment, False = contradiction, Neither = neutral
    _ANSWERS = ("True", "False", "Neither")

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        # The trailing space after "committed" keeps the two literals from
        # being fused into "committedto".
        return "Given a premise and a hypothesis, classify whether the author of the premise is committed " \
            "to the truth of the hypothesis. The three possible labels are true, false or neither."

    def doc_to_text(self, doc):
        """Render the prompt: premise, then the hypothesis as a question."""
        return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        """Return the gold continuation (leading space + answer string)."""
        return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        """Request log-likelihoods for the three answers, in label-id order."""
        ll_true, _ = rf.loglikelihood(ctx, ' True')
        ll_false, _ = rf.loglikelihood(ctx, ' False')
        ll_neither, _ = rf.loglikelihood(ctx, ' Neither')

        return ll_true, ll_false, ll_neither

    def process_results(self, doc, results):
        """Score one doc.

        Returns per-doc accuracy plus the (prediction, gold) pair that
        `cb_multi_fi` aggregates into a macro F1.
        """
        gold = doc["label"]
        # Index of the most likely answer; equals a label id because the
        # requests are issued in label-id order.
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc,
            "f1": (pred, gold),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True,
        }

    @classmethod
    def cb_multi_fi(cls, items):
        """Macro-average the one-vs-rest F1 over the three label ids.

        `items` is an iterable of (prediction, gold) pairs as produced by
        `process_results`.
        """
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        per_class_f1 = [
            sklearn.metrics.f1_score(y_true=golds == label, y_pred=preds == label)
            for label in range(3)
        ]
        return mean(per_class_f1)

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }


class Copa(HFTask):
    """SuperGLUE COPA: pick the more plausible cause/effect of a premise.

    The prompt is the premise joined to a connector ("because" for cause
    questions, "therefore" for effect questions); each alternative is scored
    as a continuation of that prompt.
    """

    # NOTE(review): the few-shot description text changed (missing space
    # fixed); decide whether that warrants a version bump.
    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        # The trailing space after "without," keeps the two literals from
        # being fused into "without,choose".
        return "Given a premise and one alternative with a causal relation to the premise and another without, " \
            "choose the more plausible alternative"

    def doc_to_text(self, doc):
        """Render the prompt: premise without its last character + connector.

        Raises KeyError if `doc["question"]` is neither "cause" nor "effect".
        """
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        # Drop the period. The final character is removed unconditionally, so
        # this assumes every premise ends with one.
        return doc["premise"].strip()[:-1] + f" {connector}"

    def doc_to_target(self, doc):
        """Return the gold continuation: the correct alternative, lower-cased
        at its first character and prefixed with a space."""
        correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
        # Connect the sentences
        return " " + self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of each alternative, ordered
        (choice1, choice2)."""
        choice1 = " " + self.convert_choice(doc["choice1"])
        choice2 = " " + self.convert_choice(doc["choice2"])

        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
        ll_choice2, _ = rf.loglikelihood(ctx, choice2)

        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        """Score one doc: 1.0 when the most likely alternative's index equals
        the label."""
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }

    @staticmethod
    def convert_choice(choice):
        """Lower-case the first character of `choice` so it can follow the
        connector mid-sentence. An empty string is returned unchanged."""
        # Slicing (rather than indexing) avoids IndexError on "".
        return choice[:1].lower() + choice[1:]


class MultiRC(HFTask):
    """SuperGLUE MultiRC: judge whether a candidate answer is correct.

    Each doc pairs a paragraph/question with one candidate answer. The model
    is scored on whether "yes, <answer>" or "no, <answer>" is the more likely
    continuation.
    """

    # Bumped from 0: the prediction passed to the aggregator was inverted
    # relative to the label convention, which changes scores.
    # NOTE(review): confirm this matches the project's versioning policy.
    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return "READING COMPREHENSION ANSWER KEY"

    def doc_to_text(self, doc):
        """Render the prompt: paragraph, the question, then "Answer:"."""
        return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"

    def doc_to_target(self, doc):
        """Return the gold continuation: "yes"/"no" per the label, then the
        candidate answer, prefixed with a space."""
        return " " + self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        """Format a candidate as "<yes|no>, <answer>"; truthy `label` means
        "yes"."""
        label_str = "yes" if label else "no"
        return f"{label_str}, {answer}"

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of the "yes" and "no" renderings of the
        candidate answer, ordered (yes, no)."""
        true_choice = self.format_answer(answer=doc["answer"], label=True)
        false_choice = self.format_answer(answer=doc["answer"], label=False)

        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')

        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        """Return the (prediction, doc) pair consumed by `acc_all`.

        The prediction is True when the "yes" rendering is more likely, so it
        follows the same convention as the label (truthy = answer is correct).
        An argmax over (yes, no) would give index 0 for "yes", i.e. the
        inverse of that convention.
        """
        ll_true_choice, ll_false_choice = results
        pred = ll_true_choice > ll_false_choice
        return {
            "acc": (pred, doc)
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        # acc_all is defined elsewhere; presumably it groups the (pred, doc)
        # pairs per question — verify against its implementation.
        return {
            "acc": acc_all
        }


class ReCoRD(HFTask):
    """SuperGLUE ReCoRD: fill the "@placeholder" in a query with an entity.

    Every candidate entity is substituted into the query and scored as a
    continuation of the passage; the most likely entity is the prediction.
    """

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def training_docs(self):
        """Return the processed training docs, building the list on first use
        and caching it on `self._training_docs`."""
        if self._training_docs is None:
            self._training_docs = [
                self._process_doc(raw_doc) for raw_doc in self.data["train"]
            ]
        return self._training_docs

    def validation_docs(self):
        """Lazily yield processed validation docs (see `training_docs`)."""
        yield from map(self._process_doc, self.data["validation"])

    @classmethod
    def _process_doc(cls, doc):
        """Keep only the fields used by this task, with entities and answers
        de-duplicated and sorted."""
        unique_entities = sorted(set(doc["entities"]))
        unique_answers = sorted(set(doc["answers"]))
        return {
            "passage": doc["passage"],
            "query": doc["query"],
            "entities": unique_entities,
            "answers": unique_answers,
        }

    def doc_to_text(self, doc):
        """Render the passage: the leading text, a blank line, then one
        "  - <highlight>." bullet per "@highlight" section."""
        lead, *bullets = doc["passage"].strip().split("\n@highlight\n")
        rendered_bullets = "".join(f"  - {bullet}.\n" for bullet in bullets)
        return lead + "\n\n" + rendered_bullets

    @classmethod
    def format_answer(cls, query, entity):
        """Render the query as a bullet with "@placeholder" replaced by
        `entity`."""
        bullet = f'  - {query}'
        return bullet.replace("@placeholder", entity)

    def doc_to_target(self, doc):
        """Return the query filled with the first (sorted) gold answer."""
        # We only output the first correct entity in a doc
        first_answer = doc["answers"][0]
        return self.format_answer(query=doc["query"], entity=first_answer)

    def construct_requests(self, doc, ctx):
        """Build one log-likelihood request per candidate entity, in the order
        of `doc["entities"]`."""
        requests = []
        for entity in doc["entities"]:
            continuation = self.format_answer(query=doc["query"], entity=entity)
            requests.append(rf.loglikelihood(ctx, continuation))
        return requests

    def process_results(self, doc, results):
        """Score one doc with token F1 and exact match.

        The entity whose request has the highest first result element is the
        prediction; each metric is the maximum over the gold answers.
        """
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        scores = np.array([result[0] for result in results])
        best = np.argmax(scores)

        prediction = doc["entities"][best]
        gold_answers = doc["answers"]
        return {
            "f1": metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_answers),
            "em": metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_answers),
        }

    def higher_is_better(self):
        return {"f1": True, "em": True}

    def aggregation(self):
        return {"f1": mean, "em": mean}


class WordsInContext(HFTask):
    """SuperGLUE WiC: is a word used in the same way in two sentences?

    Scored by comparing the log-likelihood of " yes" and " no" after the
    rendered prompt.
    """

    VERSION = 0
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def fewshot_description(self):
        # TODO: figure out actual description
        return ""

    def doc_to_text(self, doc):
        """Render the prompt: both sentences, then a question naming the word.

        The word is the slice of `sentence1` between `start1` and `end1`.
        """
        first = doc["sentence1"]
        second = doc["sentence2"]
        word = first[doc["start1"]:doc["end1"]]
        return (
            f"Sentence 1: {first}\nSentence 2: {second}\n"
            f"Question: Is the word '{word}' used in the same way in the"
            " two sentences above?\nAnswer:"
        )

    def doc_to_target(self, doc):
        """Return the gold continuation: " no" for label 0, " yes" for 1."""
        answer_by_label = {0: "no", 1: "yes"}
        return " " + answer_by_label[doc["label"]]

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of " yes" and " no", ordered (yes, no)."""
        scores = []
        for continuation in (" yes", " no"):
            ll, _ = rf.loglikelihood(ctx, continuation)
            scores.append(ll)
        return tuple(scores)

    def process_results(self, doc, results):
        """Score one doc: 1.0 when "yes beats no" agrees with the label."""
        yes_score, no_score = results
        predicted_yes = yes_score > no_score
        is_correct = predicted_yes == doc["label"]
        return {"acc": 1. if is_correct else 0.}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}


class SGWinogradSchemaChallenge(HFTask):
    """SuperGLUE WSC: does a marked pronoun refer to a given noun?

    Scored by comparing the log-likelihood of " yes" and " no" after the
    rendered prompt.
    """

    # NOTE(review): prompts for docs whose pronoun is the first word changed
    # (see doc_to_text); decide whether that warrants a version bump.
    VERSION = 0
    # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
    #       binary version of the task.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        """Return the training docs with a truthy label, cached on
        `self._training_docs`.

        Returns None (implicitly) if `has_training_docs()` is false.
        """
        if self.has_training_docs():
            if self._training_docs is None:
                # GPT-3 Paper's format only uses positive examples for fewshot "training"
                self._training_docs = [
                    doc for doc in
                    self.data["train"]
                    if doc["label"]
                ]
            return self._training_docs

    def fewshot_description(self):
        return "Final Exam with Answer Key\n" \
           "Instructions: Please carefully read the following passages. " \
           "For each passage, you must identify which noun the pronoun marked in *bold*" \
           " refers to.\n====="

    def doc_to_text(self, doc):
        """Render the prompt: the passage with the pronoun wrapped in
        asterisks, followed by the yes/no question about the noun."""
        raw_passage = doc["text"]
        noun = doc["span1_text"]
        pronoun = doc["span2_text"]
        # NOTE: HuggingFace span indices are word-based not character-based.
        pre = " ".join(raw_passage.split()[:doc["span2_index"]])
        if pre:
            # Skip `pre`, the single space after it, and the pronoun itself.
            post = raw_passage[len(pre) + 1 + len(pronoun):]
            marked = f"{pre} *{pronoun}*{post}"
        else:
            # The pronoun is the first word: there is no separating space to
            # skip and none to insert before the marker.
            post = raw_passage[len(pronoun):]
            marked = f"*{pronoun}*{post}"
        passage = general_detokenize(marked)
        text = (
            f"Passage: {passage}\n"
            + f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
            + "Answer:"
        )
        return text

    def doc_to_target(self, doc):
        """Return the gold continuation: a space followed by `yesno(label)`."""
        return " " + yesno(doc['label'])

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of " yes" and " no", ordered (yes, no)."""
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        """Score one doc: 1.0 when "yes beats no" agrees with the label."""
        ll_yes, ll_no = results
        gold = doc["label"]

        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {
            "acc": acc
        }

    def higher_is_better(self):
        return {
            "acc": True
        }

    def aggregation(self):
        return {
            "acc": mean
        }