glue.py 15 KB
Newer Older
Leo Gao's avatar
Leo Gao committed
1
2
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.

Jason Phang's avatar
checkin  
Jason Phang committed
3
import numpy as np
Jason Phang's avatar
Jason Phang committed
4
from scipy.stats import pearsonr, spearmanr
Jason Phang's avatar
checkin  
Jason Phang committed
5
from sklearn.metrics import f1_score, matthews_corrcoef
Jason Phang's avatar
Jason Phang committed
6
from tqdm import auto as tqdm_lib
sdtblck's avatar
sdtblck committed
7
from . common import HFTask, simple_accuracy_metric, yesno
Jason Phang's avatar
checkin  
Jason Phang committed
8

Jason Phang's avatar
Jason Phang committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def get_accuracy_and_f1(preds, golds):
    """Score predictions with accuracy, F1 and the mean of the two.

    Args:
        preds: Sequence of predicted labels.
        golds: Sequence of reference labels, aligned with ``preds``.

    Returns:
        A dict with ``"major"`` (the mean of accuracy and F1), ``"minor"``
        (accuracy, F1 and their mean under ``"acc"``, ``"f1"`` and
        ``"acc_and_f1"``) and ``"higher_is_better"`` (always ``True``).
    """
    gold_arr = np.array(golds)
    pred_arr = np.array(preds)
    # Element-wise agreement averaged over all examples.
    accuracy = float((pred_arr == gold_arr).mean())
    f1 = float(f1_score(y_true=gold_arr, y_pred=pred_arr))
    combined = (accuracy + f1) / 2
    return {
        "major": combined,
        "minor": {
            "acc": accuracy,
            "f1": f1,
            "acc_and_f1": combined,
        },
        "higher_is_better": True,
    }


sdtblck's avatar
sdtblck committed
26
class CoLA(HFTask):
    """GLUE ``cola`` task, posed as a True/False question about one sentence.

    Label 1 is verbalised as "True" and label 0 as "False"; the task is
    scored with the Matthews correlation coefficient.
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "cola"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Does this sentence make sense?:\tTrue or False?"

    def doc_to_text(self, doc):
        """Render the prompt for ``doc``, ending right before the answer."""
        template = "Sentence: {}\nAnswer:"
        return template.format(doc["sentence"])

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        answer_by_label = {1: "True", 0: "False"}
        return " {}".format(answer_by_label[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict True/False for every doc and report Matthews correlation.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            A dict with ``"major"`` (MCC), ``"minor"`` (``{"mcc": ...}``) and
            ``"higher_is_better"``.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        gold_labels = [doc["label"] for doc in docs]
        predictions = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            true_score = lm.loglikelihood(ctx, ' True')
            false_score = lm.loglikelihood(ctx, ' False')
            # A truthy prediction corresponds to label 1 ("True").
            predictions.append(true_score > false_score)
        mcc = float(matthews_corrcoef(
            y_true=np.array(gold_labels),
            y_pred=np.array(predictions),
        ))
        return {
            "major": mcc,
            "minor": {"mcc": mcc},
            "higher_is_better": True,
        }


sdtblck's avatar
sdtblck committed
74
class MNLI(HFTask):
    """GLUE ``mnli`` task evaluated on the matched validation/test splits.

    Labels are verbalised as 0 -> "True" (entailment), 1 -> "Neither"
    (neutral) and 2 -> "False" (contradiction).
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "mnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        """Return the matched validation split, or ``None`` if unavailable."""
        if not self.has_validation_docs():
            return None
        return self.data["validation_matched"]

    def test_docs(self):
        """Return the matched test split, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.data["test_matched"]

    def doc_to_text(self, doc):
        """Render premise and hypothesis as a three-way question prompt."""
        premise = doc["premise"]
        hypothesis = doc["hypothesis"]
        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            premise,
            hypothesis,
        )

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        # True = entailment
        # False = contradiction
        # Neither = neutral
        answer_by_label = {0: "True", 1: "Neither", 2: "False"}
        return " {}".format(answer_by_label[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Pick the best-scoring of the three answers per doc; report accuracy.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            Whatever ``simple_accuracy_metric`` returns for the predictions.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        gold_labels = [doc["label"] for doc in docs]
        predictions = []
        # Position in this tuple equals the integer label of the answer.
        choices = (' True', ' Neither', ' False')
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            scores = np.array([lm.loglikelihood(ctx, choice) for choice in choices])
            predictions.append(np.argmax(scores))
        return simple_accuracy_metric(preds=predictions, golds=gold_labels)


Jason Phang's avatar
Jason Phang committed
130
131
132
133
134
135
136
137
138
139
140
class MNLIMismatched(MNLI):
    """MNLI variant that reads the mismatched validation/test splits."""

    def validation_docs(self):
        """Return the mismatched validation split, or ``None`` if unavailable."""
        if not self.has_validation_docs():
            return None
        return self.data["validation_mismatched"]

    def test_docs(self):
        """Return the mismatched test split, or ``None`` if unavailable."""
        if not self.has_test_docs():
            return None
        return self.data["test_mismatched"]


sdtblck's avatar
sdtblck committed
141
class MRPC(HFTask):
    """GLUE ``mrpc`` task: decide whether two sentences mean the same thing.

    The answer word comes from the ``yesno`` helper applied to the label and
    the task is scored with accuracy and F1.
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc):
        """Render the sentence pair as a prompt ending right before the answer."""
        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        return " {}".format(yesno(doc["label"]))

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict yes/no for every doc and report accuracy and F1.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            The metric dict produced by ``get_accuracy_and_f1``.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            # The continuations carry a leading space so they match the form
            # doc_to_target emits after "answer:" (and the QQP task's scoring).
            # NOTE(review): assumes yesno() yields lowercase "yes"/"no" --
            # confirm against common.py.
            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
        return get_accuracy_and_f1(preds=preds, golds=golds)

183
      
sdtblck's avatar
sdtblck committed
184
class RTE(HFTask):
    """GLUE ``rte`` task, posed as a True/False entailment question.

    Label 0 (entailment) is verbalised as "True" and label 1
    (not_entailment) as "False".
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "rte"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        """Render the sentence pair as a True/False question prompt."""
        premise = doc["sentence1"]
        hypothesis = doc["sentence2"]
        return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
            premise,
            hypothesis,
        )

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        # 0 = entailment
        # 1 = not_entailment
        answer_by_label = {0: "True", 1: "False"}
        return " {}".format(answer_by_label[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict True/False for every doc and report accuracy.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            Whatever ``simple_accuracy_metric`` returns for the predictions.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        gold_labels = [doc["label"] for doc in docs]
        predictions = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            false_score = lm.loglikelihood(ctx, ' False')
            true_score = lm.loglikelihood(ctx, ' True')
            # A truthy prediction corresponds to label 1 ("False").
            predictions.append(false_score > true_score)
        return simple_accuracy_metric(preds=predictions, golds=gold_labels)


sdtblck's avatar
sdtblck committed
226
class QNLI(HFTask):
    """GLUE ``qnli`` task: does the response sentence answer the question?

    Label 0 (entailment) is verbalised as "Yes" and label 1
    (not entailment) as "No".
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        """Render question and response as a Yes/No prompt."""
        return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
            doc["question"],
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        # Yes = entailment
        # No = not entailment
        return " {}".format({0: "Yes", 1: "No"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict Yes/No for every doc and report accuracy.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            Whatever ``simple_accuracy_metric`` returns for the predictions.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            # Score the same words the prompt asks for and doc_to_target
            # emits. A truthy prediction corresponds to label 1 ("No").
            preds.append(lm.loglikelihood(ctx, ' No') > lm.loglikelihood(ctx, ' Yes'))
        return simple_accuracy_metric(preds=preds, golds=golds)


sdtblck's avatar
sdtblck committed
268
class QQP(HFTask):
    """GLUE ``qqp`` task: decide whether two questions ask the same thing.

    The answer word comes from the ``yesno`` helper applied to the label and
    the task is scored with accuracy and F1.
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "qqp"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both questions ask the same thing."

    def doc_to_text(self, doc):
        """Render the question pair as a prompt ending right before the answer."""
        first = doc["question1"]
        second = doc["question2"]
        return "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(first, second)

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        answer = yesno(doc["label"])
        return " {}".format(answer)

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict yes/no for every doc and report accuracy and F1.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            The metric dict produced by ``get_accuracy_and_f1``.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        gold_labels = [doc["label"] for doc in docs]
        predictions = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            yes_score = lm.loglikelihood(ctx, ' yes')
            no_score = lm.loglikelihood(ctx, ' no')
            predictions.append(yes_score > no_score)
        return get_accuracy_and_f1(preds=predictions, golds=gold_labels)


sdtblck's avatar
sdtblck committed
311
class STSB(HFTask):
    """GLUE ``stsb`` task: rate the similarity of two sentences from 0 to 5.

    The model generates a short continuation; its first whitespace-separated
    token is parsed as the predicted score. Predictions are compared with the
    gold labels using Pearson and Spearman correlation.
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "stsb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return (
            "Indicate if both sentences mean the same thing from a scale of 0-5, "
            "where 5 means identical and 0 means unrelated."
        )

    def doc_to_text(self, doc):
        """Render the sentence pair as a prompt ending right before the answer."""
        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        """Return the gold score for ``doc`` as text with a leading space."""
        return " {}".format(doc["label"])

    @staticmethod
    def _parse_score(output, fallback=2.5):
        """Turn generated text into a similarity score clamped to [0, 5].

        Args:
            output: Text produced by the language model.
            fallback: Score returned when no usable number can be read.

        Returns:
            The first whitespace-separated token of ``output`` as a float
            clamped to the 0-5 range, or ``fallback`` when the output is
            empty, the token is not a number, or it is NaN/infinite.
        """
        import math

        tokens = output.split()
        if not tokens:
            # Empty or whitespace-only generation: nothing to parse.
            return fallback
        try:
            value = float(tokens[0])
        except ValueError:
            return fallback
        if not math.isfinite(value):
            # float() accepts "nan"/"inf"; neither is a meaningful score.
            return fallback
        return max(min(value, 5.0), 0.0)

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Generate a score for every doc and report correlation with gold.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``generate(context, max_gen_length)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            A dict with ``"major"`` (mean of Pearson and Spearman), ``"minor"``
            (``"pearson"``, ``"spearmanr"`` and ``"corr"``) and
            ``"higher_is_better"``.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            output = lm.generate(context=ctx, max_gen_length=5)
            preds.append(self._parse_score(output))
        pearson_corr = float(pearsonr(preds, golds)[0])
        spearman_corr = float(spearmanr(preds, golds)[0])
        minor = {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }
        return {
            "major": minor["corr"],
            "minor": minor,
            "higher_is_better": True,
        }


sdtblck's avatar
sdtblck committed
373
class SST(HFTask):
    """GLUE ``sst2`` task: classify a sentence as Positive or Negative.

    Label 1 is verbalised as "Positive" and label 0 as "Negative".
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc):
        """Render the sentence as a prompt ending right before the answer."""
        template = "sentence:\t{}\t\nanswer:"
        return template.format(doc["sentence"])

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        answer_by_label = {1: "Positive", 0: "Negative"}
        return " {}".format(answer_by_label[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict Positive/Negative for every doc and report accuracy.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            Whatever ``simple_accuracy_metric`` returns for the predictions.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        gold_labels = [doc["label"] for doc in docs]
        predictions = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            positive_score = lm.loglikelihood(ctx, ' Positive')
            negative_score = lm.loglikelihood(ctx, ' Negative')
            # A truthy prediction corresponds to label 1 ("Positive").
            predictions.append(positive_score > negative_score)
        return simple_accuracy_metric(preds=predictions, golds=gold_labels)


sdtblck's avatar
sdtblck committed
415
class WNLI(HFTask):
    """GLUE ``wnli`` task, posed as a True/False entailment question.

    WNLI is a two-class task: label 1 (entailment) is verbalised as "True"
    and label 0 (not_entailment) as "False".
    NOTE(review): label order taken from the GLUE ``wnli`` configuration
    (``not_entailment``, ``entailment``) -- confirm against the loaded data.
    """

    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        """Render the sentence pair as a True/False question prompt."""
        # Only two answers are offered because the task has no neutral class.
        return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        """Return the answer word for ``doc`` with a leading space."""
        # True = entailment
        # False = not_entailment
        return " {}".format({0: "False", 1: "True"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Predict True/False for every doc and report accuracy.

        Args:
            docs: Documents to evaluate; each must carry a ``"label"``.
            lm: Language model exposing ``loglikelihood(context, continuation)``.
            provide_description: Forwarded to ``fewshot_context``.
            num_fewshot: Forwarded to ``fewshot_context``.

        Returns:
            Whatever ``simple_accuracy_metric`` returns for the predictions.
        """
        # TODO: port this evaluation to the new framework. See the interface
        # in base.py and the example BoolQ implementation in superglue.py;
        # remove this note once the evaluation code has been converted.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            # A truthy prediction corresponds to label 1 ("True").
            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
        return simple_accuracy_metric(preds=preds, golds=golds)