superglue.py 12.4 KB
Newer Older
Jason Phang's avatar
Jason Phang committed
1
2
3
4
5
"""
To-do:
    - WSC requires free-form generation
    - ReCoRD
"""
Jason Phang's avatar
Jason Phang committed
6
import numpy as np
Jason Phang's avatar
Jason Phang committed
7
8
9
10
11
from . common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics

Jason Phang's avatar
Jason Phang committed
12

13
class BoolQ(HFTask):
    """SuperGLUE BoolQ: answer a yes/no question about a short passage."""

    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out actual description
        return "Read the following passages and answer each question with a yes or a no."

    def doc_to_text(self, doc):
        # Prompt layout: passage, then the question, then a cue for the answer.
        return "{}\nquestion: {}\nanswer: ".format(doc["passage"], doc["question"])

    def doc_to_target(self, doc):
        return yesno(doc['label'])

    def construct_requests(self, doc, ctx):
        # Score both candidate continuations; the likelier one is the prediction.
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        # Predicted "yes" iff ll_yes > ll_no; compare that boolean to the gold label.
        acc = 1. if (ll_yes > ll_no) == gold else 0.
        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
62

Jason Phang's avatar
Jason Phang committed
63

64
class CommitmentBank(HFTask):
    """SuperGLUE CommitmentBank (CB): 3-way premise/hypothesis classification
    (true / neither / false), reported with accuracy and macro multiclass F1."""

    DATASET_PATH = "super_glue"
    DATASET_NAME = "cb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # NOTE: the original implicit concatenation joined "committed" directly
        # to "to" ("committedto"); the missing space is restored here.
        return "Given a premise and a hypothesis, classify whether the author of the premise is committed" \
            " to the truth of the hypothesis. The three possible labels are true, false or neither."

    def doc_to_text(self, doc):
        return "{}\nquestion: {} true, false or neither?\nanswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )

    def doc_to_target(self, doc):
        # Dataset label ids map to prompt words as:
        # True = entailment, False = contradiction, Neither = neutral
        return " {}".format({0: "true", 1: "neither", 2: "false"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        # One loglikelihood query per candidate label word.
        ll_true, _ = rf.loglikelihood(ctx, ' true')
        ll_neither, _ = rf.loglikelihood(ctx, ' neither')
        ll_false, _ = rf.loglikelihood(ctx, ' false')
        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        # Result order matches label ids (0=true, 1=neither, 2=false).
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.
        return {
            "acc": acc,
            # Defer F1 to aggregation: it needs all (pred, gold) pairs at once.
            "f1": (pred, gold)
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "f1": True
        }

    @classmethod
    def cb_multi_fi(cls, items):
        """Macro-average the per-class (one-vs-rest) F1 over the three labels."""
        preds, golds = zip(*items)
        preds = np.array(preds)
        golds = np.array(golds)
        f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
        f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
        f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
        avg_f1 = mean([f11, f12, f13])
        return avg_f1

    def aggregation(self):
        return {
            "acc": mean,
            "f1": self.cb_multi_fi,
        }
Jason Phang's avatar
Jason Phang committed
132

Jason Phang's avatar
Jason Phang committed
133

134
class Copa(HFTask):
    """SuperGLUE COPA: given a premise, choose the more plausible of two
    alternatives (its cause or its effect, depending on the question type)."""

    DATASET_PATH = "super_glue"
    DATASET_NAME = "copa"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # NOTE: the original implicit concatenation joined "without," directly
        # to "choose" ("without,choose"); the missing space is restored here.
        return "Given a premise and one alternative with a causal relation to the premise and another without," \
            " choose the more plausible alternative"

    def doc_to_text(self, doc):
        # Append the connective matching the question type so the choice reads
        # as a natural continuation of the premise.
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        # Drop the premise's final character — assumes it ends with a period.
        return doc["premise"].strip()[:-1] + f" {connector} "

    def doc_to_target(self, doc):
        correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
        # Connect the sentences: lower-case the choice's leading character.
        return self.convert_choice(correct_choice)

    def construct_requests(self, doc, ctx):
        choice1 = " " + self.convert_choice(doc["choice1"])
        choice2 = " " + self.convert_choice(doc["choice2"])

        ll_choice1, _ = rf.loglikelihood(ctx, choice1)
        ll_choice2, _ = rf.loglikelihood(ctx, choice2)

        return ll_choice1, ll_choice2

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        acc = 1. if pred == gold else 0.

        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

    @staticmethod
    def convert_choice(choice):
        """Lower-case the first character so the choice continues the premise."""
        return choice[0].lower() + choice[1:]


197
class MultiRC(HFTask):
    """SuperGLUE MultiRC: each candidate answer to a reading-comprehension
    question is independently judged True or False."""

    DATASET_PATH = "super_glue"
    DATASET_NAME = "multirc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "READING COMPREHENSION ANSWER KEY"

    def doc_to_text(self, doc):
        return "{}\n\n{}\n".format(doc["paragraph"], doc["question"])

    def doc_to_target(self, doc):
        return self.format_answer(answer=doc["answer"], label=doc["label"])

    @staticmethod
    def format_answer(answer, label):
        # Render the candidate as "[True] <answer>" or "[False] <answer>".
        return "[{}] {}".format("True" if label else "False", answer)

    def construct_requests(self, doc, ctx):
        # Score the same candidate answer rendered under both labels.
        true_rendering = self.format_answer(answer=doc["answer"], label=True)
        false_rendering = self.format_answer(answer=doc["answer"], label=False)

        ll_true_choice, _ = rf.loglikelihood(ctx, " " + true_rendering)
        ll_false_choice, _ = rf.loglikelihood(ctx, " " + false_rendering)

        return ll_true_choice, ll_false_choice

    def process_results(self, doc, results):
        # Index 0 is the "[True]" rendering, so pred==0 means predicted True.
        pred = np.argmax(results)
        # acc_all groups per-question answers, so it needs the doc alongside
        # the prediction.
        return {"acc": (pred, doc)}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": acc_all}

Jason Phang's avatar
Jason Phang committed
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

class ReCoRD(HFTask):
    """SuperGLUE ReCoRD: cloze-style QA — pick the entity from the passage
    that fills the @placeholder slot in the query; scored with token F1/EM."""

    DATASET_PATH = "super_glue"
    DATASET_NAME = "record"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        # In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
        # Each doc consists of multiple answer candidates, each of which is scored yes/no.
        # Hence, we create one "doc" for each (context + passage, answer) pair.
        # Moreover, we only use the correct answers for context packing
        # (This is not an issue for evaluation, where we can directly score multiple candidates at once).
        if self.has_training_docs():
            if self._training_docs is None:
                self._training_docs = []
                for doc in self.data["train"]:
                    for entity in list(set(doc["entities"])):
                        self._training_docs.append({
                            "passage": doc["passage"],
                            "query": doc["query"],
                            "entity": entity,
                            "label": entity in doc["answers"],
                        })
            return self._training_docs

    def doc_to_text(self, doc):
        # The passage field embeds bullet highlights separated by "@highlight"
        # markers; render them as an indented bullet list.
        initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
        text = initial_text + "\n\n"
        for highlight in highlights:
            text += f"  - {highlight}.\n"
        return text

    @classmethod
    def format_answer(cls, query, entity):
        """Render the query as a bullet with @placeholder substituted."""
        return f'  - {query}'.replace("@placeholder", entity)

    def doc_to_target(self, doc):
        return self.format_answer(query=doc["query"], entity=doc["entity"])

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate entity, in doc order.
        requests = [
            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
            for entity in doc["entities"]
        ]
        return requests

    def process_results(self, doc, results):
        # ReCoRD's evaluation is actually deceptively simple:
        # - Pick the maximum likelihood prediction entity
        # - Evaluate the accuracy and token F1 PER EXAMPLE
        # - Average over all examples
        #
        # Each result is a (loglikelihood, is_greedy) pair (see the unsplit
        # rf.loglikelihood requests above). np.argmax over the raw list would
        # flatten the pairs and return a wrong index, so extract the
        # loglikelihoods first.
        max_idx = np.argmax(np.array([result[0] for result in results]))
        prediction = doc["entities"][max_idx]
        gold_label_set = list(set(doc["answers"]))
        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)

        return {
            "f1": f1,
            "em": em,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }

    def aggregation(self):
        return {
            "f1": mean,
            "em": mean,
        }


332
class WordsInContext(HFTask):
    """SuperGLUE WiC: decide whether a target word is used with the same
    sense in two sentences (yes/no)."""

    DATASET_PATH = "super_glue"
    DATASET_NAME = "wic"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        # The target word is sliced out of sentence1 using the provided
        # character offsets.
        target_word = doc["sentence1"][doc["start1"]:doc["end1"]]
        return "{}\n{}\nQuestion: Is the word '{}' used in the same way in the" \
               " two sentences above?\nanswer:".format(
                    doc["sentence1"],
                    doc["sentence2"],
                    target_word,
                )

    def doc_to_target(self, doc):
        # Label 1 -> " yes", label 0 -> " no".
        return " " + ("yes" if doc["label"] == 1 else "no")

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
        ll_no, _ = rf.loglikelihood(ctx, ' no')

        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]

        # Predicted "yes" iff ll_yes > ll_no; compare to the gold label.
        acc = 1. if (ll_yes > ll_no) == gold else 0.

        return {"acc": acc}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}
Jason Phang's avatar
Jason Phang committed
381
382


383
class SGWinogradSchemaChallenge(HFTask):
    """SuperGLUE WSC: identify the noun a marked pronoun refers to.

    Scoring is not implemented yet — evaluation requires free-form
    generation (see construct_requests / process_results).
    """

    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                # GPT-3 Paper's format only uses positive examples for fewshot "training"
                self._training_docs = [
                    doc for doc in self.data["train"] if doc["label"]
                ]
            return self._training_docs

    def fewshot_description(self):
        return "Final Exam with Answer Key\n" \
           "Instructions: Please carefully read the following passages. " \
           "For each passage, you must identify which noun the pronoun marked in *bold*" \
           " refers to.\n====="

    def doc_to_text(self, doc):
        raw_passage = doc["text"]
        pronoun = doc["span2_text"]
        # Wrap the marked pronoun span in asterisks (e.g. "he" -> "*he*"),
        # using the span's character index to locate it.
        before = raw_passage[:doc["span2_index"]]
        after = raw_passage[doc["span2_index"] + len(pronoun):]
        passage = before + "*{}*".format(pronoun) + after
        return (
            f"Passage: {passage}\n"
            + f"Question: In the passage above, what does the pronoun \"*{pronoun}*\" refer to?\n"
            + "Answer:"
        )

    def doc_to_target(self, doc):
        return " " + doc["span1_text"]

    def construct_requests(self, doc, ctx):
        # Evaluate probability of generating answer based on span1_text (coref target)
        raise NotImplementedError("requires free-form generation")

    def process_results(self, doc, results):
        # Evaluate probability of generating answer based on span1_text (coref target)
        raise NotImplementedError("requires evaluation from free-form generation")

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}