Commit 346e2c22 authored by jon-tow, committed by guac

feat(tasks): Add SIQA

parent df3da98c
@@ -23,6 +23,7 @@ from . import naturalqs
from . import sat
from . import arithmetic
from . import lambada
from . import siqa
from . import piqa
from . import prost
from . import mc_taco
@@ -123,6 +124,7 @@ TASK_REGISTRY = {
"lambada_standard": lambada.LambadaStandard,
"lambada_openai_cloze": lambada_cloze.LambadaOpenAICloze,
"lambada_standard_cloze": lambada_cloze.LambadaStandardCloze,
"siqa": siqa.SIQA,
# multilingual lambada
**lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText,
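With the import and the TASK_REGISTRY entry above, the new task becomes selectable by the name "siqa". A minimal sketch of resolving it programmatically, assuming the registry-lookup helper get_task_dict that already ships with the harness (it is not part of this commit):

from lm_eval import tasks

# Resolve the newly registered task by name; returns {"siqa": <SIQA instance>}.
task_dict = tasks.get_task_dict(["siqa"])
siqa_task = task_dict["siqa"]

The task implementation itself is added in a new module (the target of the relative import above), shown below.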
"""
SOCIAL IQA: Commonsense Reasoning about Social Interactions
https://aclanthology.org/D19-1454.pdf
Social IQa (Social Interaction QA) is a question-answering benchmark for testing
social commonsense intelligence. Unlike many prior benchmarks that focus on
physical or taxonomic knowledge, Social IQa focuses on reasoning about people’s
actions and their social implications. For example, given an action like "Jesse
saw a concert" and a question like "Why did Jesse do this?", humans can easily
infer that Jesse wanted "to see their favorite performer" or "to enjoy the music",
and not "to see what's happening inside" or "to see if it works". The actions in Social IQa
span a wide variety of social situations, and answer candidates contain both human-curated
answers and adversarially-filtered machine-generated candidates.
Social IQa contains over 37,000 QA pairs for evaluating models’ abilities to reason
about the social implications of everyday events and situations.
Homepage: https://leaderboard.allenai.org/socialiqa/submissions/get-started
"""
from lm_eval.base import MultipleChoiceTask


_CITATION = """
@inproceedings{Sap2019SocialIC,
    title={Social IQA: Commonsense Reasoning about Social Interactions},
    author={Maarten Sap and Hannah Rashkin and Derek Chen and Ronan Le Bras and Yejin Choi},
    booktitle={Conference on Empirical Methods in Natural Language Processing},
    year={2019}
}
"""


class SIQA(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "social_i_qa"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                self._training_docs = list(
                    map(self._process_doc, self.dataset["train"])
                )
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        return {
            "query": f"{doc['context']}\nQuestion: {doc['question']}",
            "choices": [doc['answerA'], doc['answerB'], doc['answerC']],
            "gold": int(doc['label']) - 1,  # `-1` because the labels are 1-indexed.
        }

    def doc_to_text(self, doc):
        return doc["query"] + "\nAnswer:"