Unverified Commit 302ca3d6 authored by Charles Foster, committed by GitHub

Merge pull request #1 from EleutherAI/master

Bringing up to date with Eleuther repo.
parents aa125d0a b8a3edaf
@@ -59,30 +59,7 @@ Both LMs (`lm_eval.models`) and Tasks (`lm_eval.tasks`) are kept in a registry d
**If you want to extend either models or tasks, simply add a new LM or Task subclass, and decorate with the registry decorator**.
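A minimal sketch of what a new task might look like, following the task classes elsewhere in this diff and the dict-style `TASK_REGISTRY` in the `lm_eval/tasks/__init__.py` hunk below; `MyTask` and `"my_task"` are hypothetical names, and the decorator itself is not shown in this diff:

```python
# Hypothetical sketch: a new Task in the style of the classes in this diff.
from lm_eval.base import Dataset

class MyTask(Dataset):
    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        # Real tasks load a dataset here; a literal list keeps the sketch short.
        return [{"question": "What is 2 + 2?", "answer": "4"}]

    def doc_to_text(self, doc, include_target=True):
        text = "Q: " + doc["question"] + "\nA: "
        if include_target:
            text += doc["answer"]
        return text

# Registration is a dict entry in lm_eval/tasks/__init__.py:
#   TASK_REGISTRY["my_task"] = MyTask
```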
**GLUE**
- [X] CoLA
- [X] MNLI
- [X] MRPC
- [X] RTE
- [X] QNLI
- [X] QQP
- [X] STS-B
- [X] SST-2
- [X] WNLI
**SuperGLUE**
- [X] BoolQ
- [X] CommitmentBank
- [X] COPA
- [ ] MultiRC
- [ ] ReCoRD
- [X] RTE (See: GLUE)
- [X] WiC
- [X] WSC
**QA Tasks**
- [ ] CoQA
- [ ] DROP
The [GPT-3 Evaluations Project](https://github.com/EleutherAI/lm_evaluation_harness/projects/1) tracks our progress implementing new tasks. Right now, we are focused on getting all the datasets loaded so that we can dedupe against the training data. Implementing the actual evaluations is nice to have but not necessary at the moment.

## Description
@@ -122,9 +99,6 @@ With the data downloader in place, we simply need to (1) expose the val/test exa
### 3. Adding task training data to LM training set
This part is the easiest: we write out text files containing the training data and let the usual LM preprocessing pipeline handle it from there. A sketch of that step follows below.
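A minimal sketch of that step, assuming each task exposes `training_docs()` and `doc_to_text()` as the task classes in this diff do; the output layout and the blank-line separator are assumptions, not project decisions:

```python
import os

from lm_eval.tasks import TASK_REGISTRY

def dump_training_data(out_dir="data/lm_train"):
    """Write each task's training docs to a plain-text file so the
    usual LM preprocessing pipeline can consume them."""
    os.makedirs(out_dir, exist_ok=True)
    for name, task_cls in TASK_REGISTRY.items():
        task = task_cls()
        if not task.has_training_docs():
            continue
        with open(os.path.join(out_dir, name + ".txt"), "w") as f:
            for doc in task.training_docs():
                # One example per block, separated by a blank line.
                f.write(task.doc_to_text(doc, include_target=True) + "\n\n")
```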
=======
## Summary (need to convert from google docs at some point):
https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
## Current Tasks:
@@ -4,6 +4,11 @@ from . import arc
from . import race
from . import webqs
from . import anli
from . import winogrande
from . import quac
from . import hellaswag
from . import openbookqa
from . import squad
TASK_REGISTRY = {
    # GLUE
@@ -26,8 +31,13 @@ TASK_REGISTRY = {
    # Order by benchmark/genre?
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
    "quac": quac.QuAC,
    "hellaswag": hellaswag.HellaSwag,
    "openbookqa": openbookqa.OpenBookQA,
    "squad": squad.SQuAD,
    "race": race.RACE,
    "webqs": webqs.WebQs,
    "winogrande": winogrande.Winogrande,
    "anli_r1": anli.ANLIRound1,
    "anli_r2": anli.ANLIRound2,
    "anli_r3": anli.ANLIRound3,
...
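For context, the registry is consumed by plain dictionary lookup; a sketch (the `get_task` helper is hypothetical, not part of this diff):

```python
from lm_eval.tasks import TASK_REGISTRY

def get_task(task_name):
    # Hypothetical helper: resolve a task name to an instantiated task.
    try:
        return TASK_REGISTRY[task_name]()
    except KeyError:
        raise KeyError(f"Unknown task {task_name!r}; known tasks: {sorted(TASK_REGISTRY)}")
```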
from .common import HFTask


class HellaSwag(HFTask):
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."

    def doc_to_text(self, doc, include_target=True):
        text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        if include_target:
            # HF datasets stores the gold ending's index as a string ('0'-'3').
            label = doc['label']
            if label not in ('0', '1', '2', '3'):
                raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
            text += doc['endings'][int(label)]
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
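`evaluate` is still a TODO above. A sketch of the usual multiple-choice approach, scoring each candidate ending by log-likelihood; the `lm.loglikelihood(context, continuation)` interface is an assumption, not something this diff defines:

```python
import numpy as np

def evaluate_hellaswag(lm, docs):
    # Hypothetical sketch: pick the ending with the highest
    # log-likelihood given the context and measure accuracy.
    correct = 0
    for doc in docs:
        context = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        scores = [lm.loglikelihood(context, ending) for ending in doc['endings']]
        if int(np.argmax(scores)) == int(doc['label']):
            correct += 1
    return {"accuracy": correct / len(docs)}
```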
from .common import HFTask


class OpenBookQA(HFTask):
    DATASET_PATH = "openbookqa"
    DATASET_NAME = "main"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            # Cache the training split so repeated few-shot sampling
            # does not re-materialize it.
            if self._training_docs is None:
                self._training_docs = list(self.data["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Text of the question prompt\nText of the answer completion"

    def doc_to_text(self, doc, include_target=True):
        text = doc['question_stem'] + '\n'
        if include_target:
            # The gold answer is a letter key into the parallel choices list.
            letter_answer = doc['answerKey']
            if letter_answer not in ('A', 'B', 'C', 'D'):
                raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
            text += doc['choices']['text']['ABCD'.index(letter_answer)] + '.'
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
import json
import os
import random

from lm_eval.base import Dataset
from ..utils import sh


class QuAC(Dataset):
    def __init__(self):
        super().__init__()

    def download(self):
        if not os.path.exists('data/quac'):
            sh("""
            mkdir -p data/quac
            wget https://s3.amazonaws.com/my89public/quac/train_v0.2.json -O data/quac/train_v0.2.json
            wget https://s3.amazonaws.com/my89public/quac/val_v0.2.json -O data/quac/val_v0.2.json
            """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        with open('data/quac/train_v0.2.json') as f:
            myjson = json.load(f)['data']
        return self.load_doc(myjson)

    def validation_docs(self):
        with open('data/quac/val_v0.2.json') as f:
            myjson = json.load(f)['data']
        return self.load_doc(myjson)

    def test_docs(self):
        raise NotImplementedError("QuAC has no test docs.")

    def fewshot_examples(self, k):
        traindocs = list(self.training_docs())
        random.shuffle(traindocs)
        return traindocs[:k]

    def fewshot_description(self):
        desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
        return desc

    def load_doc(self, myjson):
        # Flatten each (title, paragraph) into one doc per question/answer pair.
        docs = []
        for item in myjson:
            title = item['title'] + ' - ' + item['section_title']
            paragraph = item['paragraphs'][0]['context'].replace("CANNOTANSWER", "")
            qas = item['paragraphs'][0]['qas']
            qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas]
            for (question, answer) in qa_pairs:
                docs.append({
                    'title': title,
                    'paragraph': paragraph,
                    'question': question,
                    'answer': answer,
                })
        return docs

    def doc_to_text(self, doc, include_target=True):
        text = 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
        if include_target:
            text += doc['answer']
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
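With `fewshot_examples` and `fewshot_description` in place, assembling a k-shot prompt is mechanical; a sketch (the blank-line separator and the `provide_description` convention are assumptions):

```python
def build_fewshot_prompt(task, doc, k, provide_description=True):
    # Hypothetical sketch: description, k solved examples, then the
    # query document without its target.
    parts = []
    if provide_description:
        parts.append(task.fewshot_description())
    for shot in task.fewshot_examples(k):
        parts.append(task.doc_to_text(shot, include_target=True))
    parts.append(task.doc_to_text(doc, include_target=False))
    return "\n\n".join(parts)
```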
from .common import HFTask
from ..utils_stream import X, each, apply, join, filt, one
import collections
import datasets

class RACE(HFTask):
@@ -26,7 +26,7 @@ class RACE(HFTask):
        # is shown that one document is made per passage.
        r = collections.defaultdict(list)
        for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
            r[item['article']].append(item)
        res = list(r.values() >> each(lambda x: {
...
from .common import HFTask


class SQuAD(HFTask):
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def fewshot_description(self):
        return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."

    def doc_to_text(self, doc, include_target=True):
        text = 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
        if include_target:
            # SQuAD v2 includes unanswerable questions, which have an
            # empty answer list.
            answer_list = doc['answers']['text']
            if len(answer_list) > 0:
                answer = answer_list[0]
            else:
                answer = 'unanswerable'
            text += answer
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
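As with the other new tasks, `evaluate` is a TODO. A sketch of an exact-match scorer for SQuAD v2; `lm.generate(prompt, stop)` is an assumed interface, and real SQuAD scoring also normalizes punctuation and articles and reports F1, which this omits:

```python
def evaluate_squad_em(task, lm, docs):
    # Hypothetical sketch: greedy-generate an answer and check exact match
    # against any gold answer, treating empty gold lists as 'unanswerable'.
    exact = 0
    for doc in docs:
        prompt = task.doc_to_text(doc, include_target=False)
        pred = lm.generate(prompt, stop="\n").strip().lower()
        golds = doc['answers']['text'] or ['unanswerable']
        if any(pred == gold.strip().lower() for gold in golds):
            exact += 1
    return {"exact_match": exact / len(docs)}
```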
@@ -94,7 +94,11 @@ class Copa(HFTask):
    def doc_to_text(self, doc, include_target=True):
        # Drop the period
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        text = doc["premise"].strip()[:-1] + f" {connector} "
        if include_target:
            correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
            # Connect the sentences
...
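A worked example of the connector change (the sample doc is made up, and the final casing of the continuation depends on the "Connect the sentences" step elided from this hunk):

```python
doc = {
    "premise": "The man broke his toe.",
    "question": "cause",
    "choice1": "He got a hole in his sock.",
    "choice2": "He dropped a hammer on his foot.",
    "label": 1,
}
connector = {"cause": "because", "effect": "therefore"}[doc["question"]]
text = doc["premise"].strip()[:-1] + f" {connector} "
# text == "The man broke his toe because "
# include_target would then append choice2, the labeled correct choice.
```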
from .common import HFTask


class Winogrande(HFTask):
    DATASET_PATH = "winogrande"
    DATASET_NAME = "winogrande_xl"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same sentence with the word filled in."

    def doc_to_text(self, doc, include_target=True):
        text = doc['sentence']
        if include_target:
            # The answer key selects which of the two options fills the blank.
            answer_n = doc['answer']
            if answer_n == '1':
                answer = doc['option1']
            elif answer_n == '2':
                answer = doc['option2']
            else:
                raise ValueError("Winogrande from HF datasets contained an invalid answer key")
            text = text.replace("_", answer)
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file