Unverified Commit 302ca3d6 authored by Charles Foster, committed by GitHub

Merge pull request #1 from EleutherAI/master

Bringing up to date with Eleuther repo.
parents aa125d0a b8a3edaf
@@ -59,30 +59,7 @@ Both LMs (`lm_eval.models`) and Tasks (`lm_eval.tasks`) are kept in a registry d
**If you want to extend either models or tasks, simply add a new LM or Task subclass, and decorate with the registry decorator**.
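A minimal sketch of what a new task might look like, following the task classes elsewhere in this diff and the dict-style `TASK_REGISTRY` in the `lm_eval/tasks/__init__.py` hunk below; `MyTask` and `"my_task"` are hypothetical names, and the decorator itself is not shown in this diff:

```python
# Hypothetical sketch: a new Task in the style of the classes in this diff.
from lm_eval.base import Dataset

class MyTask(Dataset):
    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        # Real tasks load a dataset here; a literal list keeps the sketch short.
        return [{"question": "What is 2 + 2?", "answer": "4"}]

    def doc_to_text(self, doc, include_target=True):
        text = "Q: " + doc["question"] + "\nA: "
        if include_target:
            text += doc["answer"]
        return text

# Registration is a dict entry in lm_eval/tasks/__init__.py:
#   TASK_REGISTRY["my_task"] = MyTask
```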
**GLUE**
- [X] CoLA
- [X] MNLI
- [X] MRPC
- [X] RTE
- [X] QNLI
- [X] QQP
- [X] STS-B
- [X] SST-2
- [X] WNLI
**SuperGLUE**
- [X] BoolQ
- [X] CommitmentBank
- [X] COPA
- [ ] MultiRC
- [ ] ReCoRD
- [X] RTE (See: GLUE)
- [X] WiC
- [X] WSC
**QA Tasks**
- [ ] CoQA
- [ ] DROP
The [GPT-3 Evaluations Project](https://github.com/EleutherAI/lm_evaluation_harness/projects/1) tracks our progress implementing new tasks. Right now, we are focused on getting all the datasets loaded so that we can dedupe against the training data. Implementing the actual evaluations is nice to have but not necessary at the moment.

## Description
@@ -122,9 +99,6 @@ With the data downloader in place, we simply need to (1) expose the val/test exa
### 3. Adding task training data to LM training set
This part is the easiest: we write out text files containing the training data and let the usual LM preprocessing pipeline handle it from there. A sketch of that step follows below.
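A minimal sketch of that step, assuming each task exposes `training_docs()` and `doc_to_text()` as the task classes in this diff do; the output layout and the blank-line separator are assumptions, not project decisions:

```python
import os

from lm_eval.tasks import TASK_REGISTRY

def dump_training_data(out_dir="data/lm_train"):
    """Write each task's training docs to a plain-text file so the
    usual LM preprocessing pipeline can consume them."""
    os.makedirs(out_dir, exist_ok=True)
    for name, task_cls in TASK_REGISTRY.items():
        task = task_cls()
        if not task.has_training_docs():
            continue
        with open(os.path.join(out_dir, name + ".txt"), "w") as f:
            for doc in task.training_docs():
                # One example per block, separated by a blank line.
                f.write(task.doc_to_text(doc, include_target=True) + "\n\n")
```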
=======
## Summary (need to convert from google docs at some point):
https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
## Current Tasks:
@@ -4,6 +4,11 @@ from . import arc
from . import race
from . import webqs
from . import anli
from . import winogrande
from . import quac
from . import hellaswag
from . import openbookqa
from . import squad
TASK_REGISTRY = {
    # GLUE
@@ -26,8 +31,13 @@ TASK_REGISTRY = {
    # Order by benchmark/genre?
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
    "quac": quac.QuAC,
    "hellaswag": hellaswag.HellaSwag,
    "openbookqa": openbookqa.OpenBookQA,
    "squad": squad.SQuAD,
    "race": race.RACE,
    "webqs": webqs.WebQs,
    "winogrande": winogrande.Winogrande,
    "anli_r1": anli.ANLIRound1,
    "anli_r2": anli.ANLIRound2,
    "anli_r3": anli.ANLIRound3,
...
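For context, the registry is consumed by plain dictionary lookup; a sketch (the `get_task` helper is hypothetical, not part of this diff):

```python
from lm_eval.tasks import TASK_REGISTRY

def get_task(task_name):
    # Hypothetical helper: resolve a task name to an instantiated task.
    try:
        return TASK_REGISTRY[task_name]()
    except KeyError:
        raise KeyError(f"Unknown task {task_name!r}; known tasks: {sorted(TASK_REGISTRY)}")
```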
from .common import HFTask


class HellaSwag(HFTask):
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."

    def doc_to_text(self, doc, include_target=True):
        text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        if include_target:
            # HF datasets stores the gold ending's index as a string ('0'-'3').
            label = doc['label']
            if label not in ('0', '1', '2', '3'):
                raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
            text += doc['endings'][int(label)]
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
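`evaluate` is still a TODO above. A sketch of the usual multiple-choice approach, scoring each candidate ending by log-likelihood; the `lm.loglikelihood(context, continuation)` interface is an assumption, not something this diff defines:

```python
import numpy as np

def evaluate_hellaswag(lm, docs):
    # Hypothetical sketch: pick the ending with the highest
    # log-likelihood given the context and measure accuracy.
    correct = 0
    for doc in docs:
        context = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
        scores = [lm.loglikelihood(context, ending) for ending in doc['endings']]
        if int(np.argmax(scores)) == int(doc['label']):
            correct += 1
    return {"accuracy": correct / len(docs)}
```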
from .common import HFTask


class OpenBookQA(HFTask):
    DATASET_PATH = "openbookqa"
    DATASET_NAME = "main"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            # Cache the training split so repeated few-shot sampling
            # does not re-materialize it.
            if self._training_docs is None:
                self._training_docs = list(self.data["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Text of the question prompt\nText of the answer completion"

    def doc_to_text(self, doc, include_target=True):
        text = doc['question_stem'] + '\n'
        if include_target:
            # The gold answer is a letter key into the parallel choices list.
            letter_answer = doc['answerKey']
            if letter_answer not in ('A', 'B', 'C', 'D'):
                raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
            text += doc['choices']['text']['ABCD'.index(letter_answer)] + '.'
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
import json
import os
import random

from lm_eval.base import Dataset
from ..utils import sh


class QuAC(Dataset):
    def __init__(self):
        super().__init__()

    def download(self):
        if not os.path.exists('data/quac'):
            sh("""
            mkdir -p data/quac
            wget https://s3.amazonaws.com/my89public/quac/train_v0.2.json -O data/quac/train_v0.2.json
            wget https://s3.amazonaws.com/my89public/quac/val_v0.2.json -O data/quac/val_v0.2.json
            """)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        with open('data/quac/train_v0.2.json') as f:
            myjson = json.load(f)['data']
        return self.load_doc(myjson)

    def validation_docs(self):
        with open('data/quac/val_v0.2.json') as f:
            myjson = json.load(f)['data']
        return self.load_doc(myjson)

    def test_docs(self):
        raise NotImplementedError("QuAC has no test docs.")

    def fewshot_examples(self, k):
        traindocs = list(self.training_docs())
        random.shuffle(traindocs)
        return traindocs[:k]

    def fewshot_description(self):
        desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
        return desc

    def load_doc(self, myjson):
        # Flatten each (title, paragraph) into one doc per question/answer pair.
        docs = []
        for item in myjson:
            title = item['title'] + ' - ' + item['section_title']
            paragraph = item['paragraphs'][0]['context'].replace("CANNOTANSWER", "")
            qas = item['paragraphs'][0]['qas']
            qa_pairs = [(qa['question'], qa['answers'][0]['text']) for qa in qas]
            for (question, answer) in qa_pairs:
                docs.append({
                    'title': title,
                    'paragraph': paragraph,
                    'question': question,
                    'answer': answer,
                })
        return docs

    def doc_to_text(self, doc, include_target=True):
        text = 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
        if include_target:
            text += doc['answer']
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
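With `fewshot_examples` and `fewshot_description` in place, assembling a k-shot prompt is mechanical; a sketch (the blank-line separator and the `provide_description` convention are assumptions):

```python
def build_fewshot_prompt(task, doc, k, provide_description=True):
    # Hypothetical sketch: description, k solved examples, then the
    # query document without its target.
    parts = []
    if provide_description:
        parts.append(task.fewshot_description())
    for shot in task.fewshot_examples(k):
        parts.append(task.doc_to_text(shot, include_target=True))
    parts.append(task.doc_to_text(doc, include_target=False))
    return "\n\n".join(parts)
```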
from .common import HFTask
from ..utils_stream import X, each, apply, join, filt, one
import collections
import datasets

class RACE(HFTask):
@@ -26,7 +26,7 @@ class RACE(HFTask):
        # is shown that one document is made per passage.
        r = collections.defaultdict(list)
        for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
            r[item['article']].append(item)
        res = list(r.values() >> each(lambda x: {
...
from .common import HFTask


class SQuAD(HFTask):
    DATASET_PATH = "squad_v2"
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def fewshot_description(self):
        return "Title: The_Title_of_It\n\nBackground: A text passage as background to answer the question with.\n\nQ: Question about the passage.\n\nA: Answer."

    def doc_to_text(self, doc, include_target=True):
        text = 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
        if include_target:
            # SQuAD v2 includes unanswerable questions, which have an
            # empty answer list.
            answer_list = doc['answers']['text']
            if len(answer_list) > 0:
                answer = answer_list[0]
            else:
                answer = 'unanswerable'
            text += answer
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file
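As with the other new tasks, `evaluate` is a TODO. A sketch of an exact-match scorer for SQuAD v2; `lm.generate(prompt, stop)` is an assumed interface, and real SQuAD scoring also normalizes punctuation and articles and reports F1, which this omits:

```python
def evaluate_squad_em(task, lm, docs):
    # Hypothetical sketch: greedy-generate an answer and check exact match
    # against any gold answer, treating empty gold lists as 'unanswerable'.
    exact = 0
    for doc in docs:
        prompt = task.doc_to_text(doc, include_target=False)
        pred = lm.generate(prompt, stop="\n").strip().lower()
        golds = doc['answers']['text'] or ['unanswerable']
        if any(pred == gold.strip().lower() for gold in golds):
            exact += 1
    return {"exact_match": exact / len(docs)}
```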
@@ -94,7 +94,11 @@ class Copa(HFTask):
    def doc_to_text(self, doc, include_target=True):
        # Drop the period
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[doc["question"]]
        text = doc["premise"].strip()[:-1] + f" {connector} "
        if include_target:
            correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
            # Connect the sentences
...
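A worked example of the connector change (the sample doc is made up, and the final casing of the continuation depends on the "Connect the sentences" step elided from this hunk):

```python
doc = {
    "premise": "The man broke his toe.",
    "question": "cause",
    "choice1": "He got a hole in his sock.",
    "choice2": "He dropped a hammer on his foot.",
    "label": 1,
}
connector = {"cause": "because", "effect": "therefore"}[doc["question"]]
text = doc["premise"].strip()[:-1] + f" {connector} "
# text == "The man broke his toe because "
# include_target would then append choice2, the labeled correct choice.
```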
from .common import HFTask


class Winogrande(HFTask):
    DATASET_PATH = "winogrande"
    DATASET_NAME = "winogrande_xl"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test"]

    def fewshot_description(self):
        return "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same sentence with the word filled in."

    def doc_to_text(self, doc, include_target=True):
        text = doc['sentence']
        if include_target:
            # The answer key selects which of the two options fills the blank.
            answer_n = doc['answer']
            if answer_n == '1':
                answer = doc['option1']
            elif answer_n == '2':
                answer = doc['option2']
            else:
                raise ValueError("Winogrande from HF datasets contained an invalid answer key")
            text = text.replace("_", answer)
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Write evaluation function
        raise NotImplementedError()
\ No newline at end of file