Unverified commit a543cc5d authored by Stella Biderman, committed by GitHub

Merge branch 'master' into quac

parents 9017bb36 2b64cae6
@@ -5,6 +5,8 @@ from . import race
 from . import webqs
 from . import anli
 from . import quac
+from . import hellaswag
+from . import openbookqa
 from . import squad
 
 TASK_REGISTRY = {
@@ -29,6 +31,8 @@ TASK_REGISTRY = {
     "arc_easy": arc.ARCEasy,
     "arc_challenge": arc.ARCChallenge,
     "quac": quac.QuAC,
+    "hellaswag": hellaswag.HellaSwag,
+    "openbookqa": openbookqa.OpenBookQA,
     "squad": squad.SQuAD,
     "race": race.RACE,
     "webqs": webqs.WebQs,
......
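For orientation, resolving one of these registry names at runtime could look like the sketch below; the get_task helper and the zero-argument task constructor are illustrative assumptions, not part of this diff:

def get_task(task_name):
    # Look the task name up in the registry and instantiate the matching class.
    try:
        return TASK_REGISTRY[task_name]()
    except KeyError:
        raise KeyError(f"Unknown task {task_name!r}; available: {sorted(TASK_REGISTRY)}")

# e.g. get_task("openbookqa") would return an OpenBookQA instance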
hellaswag.py (new file):

from .common import HFTask
class HellaSwag(HFTask):
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            # Materialize and cache the split so repeated few-shot sampling
            # does not re-iterate the dataset (mirrors OpenBookQA below).
            if self._training_docs is None:
                self._training_docs = list(self.data["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."
    def doc_to_text(self, doc, include_target=True):
        # Prompt format: "<activity_label>: <ctx>", with the gold ending appended.
        text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        if include_target:
            # HF datasets stores the gold ending index as the string '0'-'3'.
            label = doc['label']
            if label not in ('0', '1', '2', '3'):
                raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
            text += doc['endings'][int(label)]
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
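evaluate is still a stub here. One plausible shape for it is sketched below, under the assumption that the lm argument exposes a loglikelihood(context, continuation) method returning a float log-probability; that interface is an assumption for illustration and is not shown in this diff. The idea: score each candidate ending given the context and count a hit when the argmax matches the gold label.

def _hellaswag_accuracy_sketch(docs, lm):
    # Hypothetical helper, not part of this commit. Assumes
    # lm.loglikelihood(context, continuation) -> float log-probability.
    correct, total = 0, 0
    for doc in docs:
        context = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        # Rank all candidate endings by their likelihood under the model.
        scores = [lm.loglikelihood(context, ending) for ending in doc['endings']]
        pred = max(range(len(scores)), key=scores.__getitem__)
        correct += int(pred == int(doc['label']))
        total += 1
    return {"accuracy": correct / total}

A full implementation would also honor provide_description and num_fewshot by prepending the fewshot_description and sampled training docs to the context before scoring.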
openbookqa.py (new file):

from .common import HFTask
class OpenBookQA(HFTask):
    DATASET_PATH = "openbookqa"
    DATASET_NAME = "main"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            # Materialize and cache the split so repeated few-shot sampling
            # does not re-iterate the dataset.
            if self._training_docs is None:
                self._training_docs = list(self.data["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Text of the question prompt\nText of the answer completion"
    def doc_to_text(self, doc, include_target=True):
        text = doc['question_stem'] + '\n'
        if include_target:
            letter_answer = doc['answerKey']
            if letter_answer not in ('A', 'B', 'C', 'D'):
                raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
            # Map the letter key 'A'-'D' onto an index into the choices list.
            text += doc['choices']['text'][ord(letter_answer) - ord('A')] + '.'
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
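For reference, an openbookqa document from HF datasets has the shape sketched below; the field names match the dataset schema, but the specific values are made up for illustration. doc_to_text renders the question stem followed by the keyed choice:

doc = {
    "question_stem": "Which of these conducts heat best?",
    "choices": {
        "text": ["a wooden spoon", "a steel spoon", "a plastic spoon", "a paper straw"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

task = OpenBookQA()  # note: constructing the task loads the dataset via HF datasets
print(task.doc_to_text(doc))
# Which of these conducts heat best?
# a steel spoon.

The log-likelihood comparison sketched above for HellaSwag would carry over here, with candidate continuations drawn from doc['choices']['text'].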