Merge pull request #50 from cfoster0/winograd

Winograd changes

Merge pull request #50 from cfoster0/winograd
Winograd changes
b0db32bc · Stella Biderman · GitHub · b8a3edaf · 05bd05e9 · b0db32bc
Unverified Commit b0db32bc authored Oct 23, 2020 by Stella Biderman Committed by GitHub Oct 23, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 87 additions and 2 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +3 -1

lm_eval/tasks/superglue.py lm_eval/tasks/superglue.py +1 -1

lm_eval/tasks/wsc273.py lm_eval/tasks/wsc273.py +83 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -4,6 +4,7 @@ from . import arc
 from . import race
 from . import webqs
 from . import anli
+from . import wsc273
 from . import winogrande
 from . import quac
 from . import hellaswag
@@ -27,7 +28,7 @@ TASK_REGISTRY = {
    "copa": superglue.Copa,
    "multirc": superglue.MultiRC,
    "wic": superglue.WordsInContext,
-    "wsc": superglue.WinogradSchemaChallenge,
+    "wsc": superglue.SGWinogradSchemaChallenge,
    # Order by benchmark/genre?
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
@@ -37,6 +38,7 @@ TASK_REGISTRY = {
    "squad": squad.SQuAD,
    "race": race.RACE,
    "webqs": webqs.WebQs,
+    "wsc273": wsc273.WinogradSchemaChallenge273,
    "winogrande": winogrande.Winogrande,
    "anli_r1": anli.ANLIRound1,
    "anli_r2": anli.ANLIRound2,

--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
@@ -218,7 +218,7 @@ class WordsInContext(HFTask):
        return simple_accuracy_metric(preds=preds, golds=golds)
-class WinogradSchemaChallenge(HFTask):
+class SGWinogradSchemaChallenge(HFTask):
    DATASET_PATH = "super_glue"
    DATASET_NAME = "wsc"

--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
+import json
+import random
+import os
+from lm_eval.base import Dataset
+from ..utils import sh
+class WinogradSchemaChallenge273(Dataset):
+    """WSC273 task backed by a locally cached ``data/wsc273/wsc273.json`` file.
+
+    The JSON file is read as a list whose consecutive entries (2k, 2k + 1) form one
+    schema: both share a ``question_id`` and each carries a ``substitution`` string and
+    a ``correctness`` flag. Only a test split is exposed; evaluation is not implemented.
+    """
+    def __init__(self):
+        super().__init__()
+    def download(self):
+        """Fetch wsc273.json into data/wsc273 unless that directory already exists."""
+        # Only the directory is checked, not the file itself, so a failed earlier
+        # download that left the directory behind will not be retried.
+        if not os.path.exists('data/wsc273'):
+            sh("""
+                mkdir -p data/wsc273 
+                wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json
+                """)
+    def has_training_docs(self):
+        """Return False: this task has no training split."""
+        return False
+    def has_validation_docs(self):
+        """Return False: this task has no validation split."""
+        return False
+    def has_test_docs(self):
+        """Return True: the schemas are exposed as the test split."""
+        return True
+    def training_docs(self):
+        """Return an empty list (no training split)."""
+        return []
+    def validation_docs(self):
+        """Return an empty list (no validation split)."""
+        return []
+    def test_docs(self):
+        """Load data/wsc273/wsc273.json and return the docs built by ``load_doc``."""
+        # Use a context manager so the file handle is closed as soon as parsing ends.
+        with open('data/wsc273/wsc273.json') as json_file:
+            myjson = json.load(json_file)
+        return self.load_doc(myjson)
+    def fewshot_description(self):
+        """Return the fixed description string used for this task."""
+        # This format is ONLY for the purposes of deduplication. For the task evaluation, we'll need to find a new strategy,
+        # to meet the needs of this particular task.
+        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
+    def load_doc(self, myjson):
+        """Pair up consecutive JSON entries into one doc per schema.
+
+        Reads the first 273 * 2 entries of ``myjson``. Entries 2k and 2k + 1 must share a
+        ``question_id``; the entry whose ``correctness`` equals True supplies the 'T'
+        completion and the other supplies 'F'. If both are marked correct, the second
+        entry is taken as 'T' (the precedence this code has always had).
+
+        Returns a list of ``{'id': question_id, 'completions': {'T': str, 'F': str}}``.
+        Raises ValueError when a pair's ids differ or neither entry is marked correct.
+        """
+        docs = []
+        for i in range(0, 273 * 2, 2):
+            item1 = myjson[i]
+            item2 = myjson[i+1]
+            question_id = item1['question_id']
+            if question_id != item2['question_id']:
+                raise ValueError("WSC273 has missing completion pair.")
+            # Compare with == True rather than truthiness: the flag's JSON type is not
+            # visible here, and a non-empty string such as "False" would be truthy.
+            if item2['correctness'] == True:  # noqa: E712
+                correct, incorrect = item2, item1
+            elif item1['correctness'] == True:  # noqa: E712
+                correct, incorrect = item1, item2
+            else:
+                # Without this branch the loop would reuse the previous pair's doc
+                # (or hit an unbound name on the first pair).
+                raise ValueError(
+                    "WSC273 pair %r has no completion marked correct." % (question_id,)
+                )
+            docs.append({
+                'id': question_id,
+                'completions': {
+                    'T': correct['substitution'],
+                    'F': incorrect['substitution'],
+                },
+            })
+        return docs
+    def doc_to_text(self, doc, include_target=True):
+        """Render a doc as '<correct> True. <incorrect> False.'.
+
+        ``include_target`` is accepted but currently ignored.
+        """
+        # WSC273 is currently only writing out full examples. Partial evaluation needs implementing.
+        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
+        return text
+    def evaluate(self, docs, lm):
+        """Not implemented yet; always raises NotImplementedError."""
+        # TODO: Write evaluation function
+        raise NotImplementedError()