Unverified commit 827e2d51 authored by Stella Biderman, committed by GitHub

Merge pull request #306 from jon-tow/add-swag

Add `SWAG`
parents 1da3d719 9b6c45a8
...@@ -15,6 +15,7 @@ from . import wsc273
from . import winogrande
from . import quac
from . import hellaswag
from . import swag
from . import openbookqa
from . import squad
from . import naturalqs
...@@ -136,6 +137,7 @@ TASK_REGISTRY = {
    # "quac": quac.QuAC, # not implemented yet
    "logiqa": logiqa.LogiQA,
    "hellaswag": hellaswag.HellaSwag,
    "swag": swag.SWAG,
    "openbookqa": openbookqa.OpenBookQA,
    "squad2": squad.SQuAD2,
    "race": race.RACE,
......
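For context (an illustration, not part of the diff): once the entry above is in TASK_REGISTRY, the task can be looked up by name through the harness's own API. A minimal sketch, assuming network access for the Hugging Face `swag` dataset download that the harness performs on task instantiation, and using only methods that swag.SWAG (below) actually defines:

from lm_eval.tasks import TASK_REGISTRY

task = TASK_REGISTRY["swag"]()            # resolves to swag.SWAG via the registry
assert task.has_validation_docs()         # True, per swag.py below
doc = next(iter(task.validation_docs()))  # a processed doc: query/choices/gold
print(task.doc_to_text(doc))              # the "startphrase" context
print(doc["choices"], doc["gold"])        # four endings and the gold index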
"""
SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference
https://arxiv.org/pdf/1808.05326.pdf
SWAG (Situations With Adversarial Generations) is an adversarial dataset
that consists of 113k multiple choice questions about grounded situations. Each
question is a video caption from LSMDC or ActivityNet Captions, with four answer
choices about what might happen next in the scene. The correct answer is the
(real) video caption for the next event in the video; the three incorrect
answers are adversarially generated and human verified, so as to fool machines
but not humans.
Homepage: https://rowanzellers.com/swag/
"""
from lm_eval.base import MultipleChoiceTask
_CITATION = """
@inproceedings{zellers2018swagaf,
title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference},
author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin},
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
year={2018}
}
"""
class SWAG(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "swag"
    DATASET_NAME = "regular"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # Cache the processed training split so repeated calls don't re-map it.
        if self._training_docs is None:
            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
        return self._training_docs

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        # Normalize a raw HF record into the harness's multiple-choice format:
        # a context ("query"), the four candidate endings, and the gold index.
        out_doc = {
            "query": doc["startphrase"],
            "choices": [doc["ending0"], doc["ending1"], doc["ending2"], doc["ending3"]],
            "gold": int(doc["label"]),
        }
        return out_doc

    def doc_to_text(self, doc):
        return doc["query"]
be4fcbad876124c4ba3c71970538a97fec0e36a9cc677c70b6c9243a7bcee0ec
\ No newline at end of file
{"results": {"swag": {"acc": 0.2482255323402979, "acc_norm": 0.24882535239428172, "acc_norm_stderr": 0.00305666959496067, "acc_stderr": 0.003054201832644171}}, "versions": {"swag": 0}}
\ No newline at end of file