Renamed WSC to make distinction between SuperGLUE Winograd Schemas...

Renamed WSC to make the distinction between SuperGLUE Winograd Schemas (SGWinogradSchemaChallenge) and WSC273 (WinogradSchemaChallenge273) clearer. Also added WSC273.

Renamed WSC to make distinction between SuperGLUE Winograd Schemas...
Renamed WSC to make the distinction between SuperGLUE Winograd Schemas (SGWinogradSchemaChallenge) and WSC273 (WinogradSchemaChallenge273) clearer. Also added WSC273.
05bd05e9 · Charles Foster · ca24b52f · 05bd05e9 · 05bd05e9 · 05bd05e9
Commit 05bd05e9 authored Oct 23, 2020 by Charles Foster
Hide whitespace changes
Inline Side-by-side

Showing with 87 additions and 2 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +3 -1

lm_eval/tasks/superglue.py lm_eval/tasks/superglue.py +1 -1

lm_eval/tasks/wsc273.py lm_eval/tasks/wsc273.py +83 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -4,6 +4,7 @@ from . import arc
 from . import race
 from . import webqs
 from . import anli
+from . import wsc273
 from . import winogrande
 from . import quac
 from . import hellaswag
@@ -27,7 +28,7 @@ TASK_REGISTRY = {
    "copa": superglue.Copa,
    "multirc": superglue.MultiRC,
    "wic": superglue.WordsInContext,
-    "wsc": superglue.WinogradSchemaChallenge,
+    "wsc": superglue.SGWinogradSchemaChallenge,
    # Order by benchmark/genre?
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
@@ -37,6 +38,7 @@ TASK_REGISTRY = {
    "squad": squad.SQuAD,
    "race": race.RACE,
    "webqs": webqs.WebQs,
+    "wsc273": wsc273.WinogradSchemaChallenge273,
    "winogrande": winogrande.Winogrande,
    "anli_r1": anli.ANLIRound1,
    "anli_r2": anli.ANLIRound2,

--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
@@ -218,7 +218,7 @@ class WordsInContext(HFTask):
        return simple_accuracy_metric(preds=preds, golds=golds)
-class WinogradSchemaChallenge(HFTask):
+class SGWinogradSchemaChallenge(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
+import json
+import random
+import os
+from lm_eval.base import Dataset
+from ..utils import sh
+class WinogradSchemaChallenge273(Dataset):
+    """WSC273 task: 273 questions, each stored as a pair of candidate completions.
+    Exactly one completion of every pair is marked correct. Only test documents are
+    provided, and evaluation is not implemented yet.
+    """
+    def __init__(self):
+        super().__init__()
+    def download(self):
+        """Fetch wsc273.json into data/wsc273 unless the file is already present."""
+        # Test for the file itself rather than the directory, so that a run which created
+        # the directory but never wrote the file is retried next time.
+        if not os.path.exists('data/wsc273/wsc273.json'):
+            sh("""
+                mkdir -p data/wsc273 
+                wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json
+                """)
+    def has_training_docs(self):
+        """Return False: this task has no training split."""
+        return False
+    def has_validation_docs(self):
+        """Return False: this task has no validation split."""
+        return False
+    def has_test_docs(self):
+        """Return True: the whole dataset is exposed as the test split."""
+        return True
+    def training_docs(self):
+        """Return an empty list (no training split)."""
+        return []
+    def validation_docs(self):
+        """Return an empty list (no validation split)."""
+        return []
+    def test_docs(self):
+        """Read data/wsc273/wsc273.json and return the paired test documents."""
+        # Use a context manager so the file handle is closed as soon as parsing finishes.
+        with open('data/wsc273/wsc273.json') as f:
+            myjson = json.load(f)
+        return self.load_doc(myjson)
+    def fewshot_description(self):
+        """Return the fixed task description string."""
+        # This format is ONLY for the purposes of deduplication. For the task evaluation, we'll need to find a new strategy,
+        # to meet the needs of this particular task.
+        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
+    def load_doc(self, myjson):
+        """Pair up consecutive JSON entries into one document per question.
+        Entries 2k and 2k+1 of myjson (for k in 0..272) must share a 'question_id', and
+        exactly one of the two must have a true 'correctness'.
+        Returns a list of dicts of the form
+        {'id': question_id, 'completions': {'T': correct_text, 'F': incorrect_text}}.
+        Raises ValueError if a pair has mismatched ids or does not contain exactly one
+        correct completion.
+        """
+        docs = []
+        for i in range(0, 273 * 2, 2):
+            item1 = myjson[i]
+            item2 = myjson[i+1]
+            if item1['question_id'] != item2['question_id']:
+                raise ValueError("WSC273 has missing completion pair.")
+            question_id = item1['question_id']
+            # `== True` (not `is True`) is kept on purpose so a 1/0 encoding still works.
+            first_correct = item1['correctness'] == True
+            second_correct = item2['correctness'] == True
+            # A pair with no correct entry used to re-append the previous question's doc
+            # (or hit a NameError on the very first pair), and a pair with two correct
+            # entries silently preferred the second. Fail loudly in both cases.
+            if first_correct == second_correct:
+                raise ValueError(
+                    "WSC273 pair %r must have exactly one correct completion." % (question_id,)
+                )
+            # Labels are inserted in file order, so the key order matches the source data.
+            docs.append({
+                'id': question_id,
+                'completions': {
+                    ('T' if first_correct else 'F'): item1['substitution'],
+                    ('F' if first_correct else 'T'): item2['substitution'],
+                },
+            })
+        return docs
+    def doc_to_text(self, doc, include_target=True):
+        """Render a document as '<correct> True. <incorrect> False.'.
+        include_target is currently ignored; the full example is always written out.
+        """
+        # WSC273 is currently only writing out full examples. Partial evaluation needs implementing.
+        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
+        return text
+    def evaluate(self, docs, lm):
+        """Not implemented yet; always raises NotImplementedError."""
+        # TODO: Write evaluation function
+        raise NotImplementedError()