Unverified commit 0e0e37f4 authored by Stella Biderman, committed by GitHub

Merge branch 'master' into winogrande

parents dbe43ec3 80f5fc3b
@@ -5,6 +5,10 @@ from . import race
from . import webqs
from . import anli
from . import winogrande
from . import quac
from . import hellaswag
from . import openbookqa
from . import squad
TASK_REGISTRY = {
    # GLUE
@@ -27,6 +31,10 @@ TASK_REGISTRY = {
    # Order by benchmark/genre?
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
    "quac": quac.QuAC,
    "hellaswag": hellaswag.HellaSwag,
    "openbookqa": openbookqa.OpenBookQA,
    "squad": squad.SQuAD,
    "race": race.RACE,
    "webqs": webqs.WebQs,
    "winogrande": winogrande.Winogrande,
......
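The new registry entries make each task constructible by name. A minimal sketch of the lookup, assuming a hypothetical `get_task` helper that is not part of this commit:

```python
# Hypothetical helper, not part of this commit: resolve a task name
# to a task instance via TASK_REGISTRY.
def get_task(task_name):
    try:
        return TASK_REGISTRY[task_name]()  # instantiate, e.g. quac.QuAC()
    except KeyError:
        raise KeyError(f"Unknown task: {task_name}")

# get_task("hellaswag") would return a HellaSwag instance.
```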
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib

from .common import HFTask, simple_accuracy_metric, yesno


class HellaSwag(HFTask):
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."

    def doc_to_text(self, doc, include_target=True):
        text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        if include_target:
            # HF "hellaswag" stores the gold ending index as a string '0'-'3'.
            label = doc['label']
            if label not in ('0', '1', '2', '3'):
                raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
            text += doc['endings'][int(label)]
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
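For reference, the prompt format `HellaSwag.doc_to_text` builds can be sketched standalone; the record below is hypothetical but shaped like rows of the HF `hellaswag` dataset:

```python
# Standalone sketch of the HellaSwag prompt format; doc values invented.
doc = {
    "activity_label": "Removing ice from car",
    "ctx": "Then, the man writes over the snow covering the window of a car.",
    "label": "3",
    "endings": ["ending 0", "ending 1", "ending 2",
                "then, the man continues to scrape the ice off of the car."],
}
text = doc["activity_label"] + ": " + doc["ctx"] + "\n"
text += doc["endings"][int(doc["label"])]
print(text)
# Removing ice from car: Then, the man writes over the snow covering the window of a car.
# then, the man continues to scrape the ice off of the car.
```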
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib

from .common import HFTask, simple_accuracy_metric, yesno


class OpenBookQA(HFTask):
    DATASET_PATH = "openbookqa"
    DATASET_NAME = "main"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
                self._training_docs = list(self.data["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Text of the question prompt\nText of the answer completion"

    def doc_to_text(self, doc, include_target=True):
        text = doc['question_stem'] + '\n'
        if include_target:
            # answerKey is a letter 'A'-'D'; map it to an index into choices.
            letter_answer = doc['answerKey']
            if letter_answer not in ('A', 'B', 'C', 'D'):
                raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
            text += doc['choices']['text']['ABCD'.index(letter_answer)] + '.'
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
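The letter-to-index mapping can likewise be exercised in isolation; the record below is hypothetical but shaped like HF `openbookqa` ("main" config) rows:

```python
# Standalone sketch of the OpenBookQA prompt format; doc values invented.
doc = {
    "question_stem": "Which of these would let the most heat travel through?",
    "answerKey": "B",
    "choices": {
        "label": ["A", "B", "C", "D"],
        "text": ["a new pair of jeans", "a steel spoon in a cafeteria",
                 "a cotton candy at a store", "a calvin klein cotton hat"],
    },
}
index = "ABCD".index(doc["answerKey"])  # letter key -> choices position
print(doc["question_stem"] + "\n" + doc["choices"]["text"][index] + ".")
# Which of these would let the most heat travel through?
# a steel spoon in a cafeteria.
```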
import json
import random
import os

from lm_eval.base import Dataset
from ..utils import sh


class QuAC(Dataset):
    def __init__(self):
        super().__init__()

    def download(self):
        if not os.path.exists('data/quac'):
            sh("""
                mkdir -p data/quac
                wget https://s3.amazonaws.com/my89public/quac/train_v0.2.json -O data/quac/train_v0.2.json
                wget https://s3.amazonaws.com/my89public/quac/val_v0.2.json -O data/quac/val_v0.2.json
                """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        with open('data/quac/train_v0.2.json') as f:
            myjson = json.load(f)['data']
        return self.load_doc(myjson)

    def validation_docs(self):
        with open('data/quac/val_v0.2.json') as f:
            myjson = json.load(f)['data']
        return self.load_doc(myjson)

    def test_docs(self):
        raise NotImplementedError("QuAC has no test docs.")

    def fewshot_examples(self, k):
        traindocs = list(self.training_docs())
        random.shuffle(traindocs)
        return traindocs[:k]

    def fewshot_description(self):
        desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
        return desc

    def load_doc(self, myjson):
        docs = []
        for item in myjson:
            title = item['title'] + ' - ' + item['section_title']
            # Each QuAC item carries a single paragraph (one dialogue context).
            paragraph = item['paragraphs'][0]['context'].replace("CANNOTANSWER", "")
            qas = item['paragraphs'][0]['qas']
            # Keep only the first reference answer for each question.
            qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas]
            for (question, answer) in qa_pairs:
                docs.append({
                    'title': title,
                    'paragraph': paragraph,
                    'question': question,
                    'answer': answer,
                })
        return docs

    def doc_to_text(self, doc, include_target=True):
        text = 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] \
            + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
        if include_target:
            text += doc['answer']
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
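The nesting `load_doc` walks can be illustrated with a hand-made example; the JSON below is invented but follows the field layout the code above reads:

```python
# Invented QuAC-style JSON following the fields load_doc reads above.
example = [{
    "title": "Example Article",
    "section_title": "Early life",
    "paragraphs": [{
        "context": "Some passage text. CANNOTANSWER",
        "qas": [{"question": "What is this about?",
                 "answers": [{"text": "A passage."}]}],
    }],
}]
# Flattening mirrors load_doc: one doc per (question, answer) pair,
# with "CANNOTANSWER" stripped from the context.
for item in example:
    title = item["title"] + " - " + item["section_title"]
    paragraph = item["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
    for qa in item["paragraphs"][0]["qas"]:
        print({"title": title, "paragraph": paragraph,
               "question": qa["question"],
               "answer": qa["answers"][0]["text"]})
```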
 from . common import HFTask
 from ..utils_stream import X, each, apply, join, filt, one
 import collections
-import nlp
+import datasets

 class RACE(HFTask):
@@ -26,7 +26,7 @@ class RACE(HFTask):
         # is shown that one document is made per passage.
         r = collections.defaultdict(list)
-        for item in nlp.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
+        for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
             r[item['article']].append(item)
         res = list(r.values() >> each(lambda x: {
......
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib

from .common import HFTask, simple_accuracy_metric, yesno


class SQuAD(HFTask):
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def fewshot_description(self):
        return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."

    def doc_to_text(self, doc, include_target=True):
        text = 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] \
            + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
        if include_target:
            answer_list = doc['answers']['text']
            # SQuAD v2 marks unanswerable questions with an empty answer list.
            if len(answer_list) > 0:
                answer = answer_list[0]
            else:
                answer = 'unanswerable'
            text += answer
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
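SQuAD v2 distinguishes answerable from unanswerable questions only by the answer list, which is what the `include_target` branch above relies on. A minimal sketch with a hypothetical row:

```python
# SQuAD v2 signals "unanswerable" with an empty answers["text"] list;
# the row below is hypothetical.
answers = {"text": [], "answer_start": []}
answer = answers["text"][0] if answers["text"] else "unanswerable"
print(answer)  # -> unanswerable
```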
@@ -94,7 +94,11 @@ class Copa(HFTask):
     def doc_to_text(self, doc, include_target=True):
         # Drop the period
-        text = doc["premise"].strip()[:-1] + " because "
+        connector = {
+            "cause": "because",
+            "effect": "therefore",
+        }[doc["question"]]
+        text = doc["premise"].strip()[:-1] + f" {connector} "
         if include_target:
             correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
             # Connect the sentences
......
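The new connector lookup can be exercised in isolation; the premise and question values below are hypothetical COPA-style inputs:

```python
# Standalone sketch of the connector logic added above: COPA premises
# end with a period, which is dropped and replaced by a connective
# chosen from the question type.
connector = {"cause": "because", "effect": "therefore"}
premise = "The man broke his toe."
question = "cause"
text = premise.strip()[:-1] + f" {connector[question]} "
print(repr(text))  # -> 'The man broke his toe because '
```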