Commit 7a32afeb authored by Leo Gao
Browse files

Merge branch 'master' of git@github.com:EleutherAI/lm_evaluation_harness.git

parents 1f3b3988 a2871d9c
env
*.pyc
\ No newline at end of file
*.pyc
data/
......@@ -45,3 +45,12 @@ With the data downloader in place, we simply need to (1) expose the val/test exa
### 3. Adding task training data to LM training set
This part is the easiest. We can probably just write out text files containing the training data and let the usual LM preprocessing pipeline handle it from there.

## Summary (to be converted from Google Docs at some point):
https://docs.google.com/document/d/177dwJpH8GHebISXYZSn4NL98sXdCtQMH82b7O5F7jmw/edit?usp=sharing
## Current Datasets:
[] CoQA
[] DROP
......@@ -12,6 +12,14 @@ class LM(abc.ABC):
class Dataset(abc.ABC):
# Abstract hook with a placeholder body: every concrete Dataset must override it.
# Presumably reports whether the dataset has a training split (the CoQA
# implementation returns True) -- confirm against callers.
@abc.abstractmethod
def has_training_docs(self):
pass
# Abstract hook with a placeholder body: every concrete Dataset must override it.
# Presumably reports whether the dataset has a validation split (the CoQA
# implementation returns False) -- confirm against callers.
@abc.abstractmethod
def has_validation_docs(self):
pass
# Abstract hook with a placeholder body: every concrete Dataset must override it.
# Presumably returns the training documents; the CoQA implementation returns
# a list built from its training JSON file.
@abc.abstractmethod
def training_docs(self):
pass
......@@ -42,4 +50,4 @@ class Dataset(abc.ABC):
# Abstract hook with a placeholder body: every concrete Dataset must override it.
# Receives the documents, a language model, and two few-shot settings
# (provide_description, num_fewshot); their exact types and the return value
# are not defined here -- TODO document once an implementation exists.
@abc.abstractmethod
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
\ No newline at end of file
from ...base import Dataset
from base import Dataset
import os
import json
import random
class CoQA(Dataset):
# CoQA has a training split: training_docs() loads it from the train JSON file.
def has_training_docs(self):
return True
# No validation split is exposed for CoQA; validation_docs() is a no-op.
def has_validation_docs(self):
return False
def training_docs(self):
    """Load the CoQA training split.

    Reads ``data/coqa/coqa-train-v1.0.json`` (resolved against the current
    working directory), takes the list stored under its top-level
    ``'data'`` key and hands it to ``self.load_doc``.

    Returns:
        Whatever ``self.load_doc`` returns for the training records.

    Raises:
        FileNotFoundError: if the data file has not been downloaded.
        KeyError: if the JSON document has no ``'data'`` key.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector. JSON is UTF-8 by
    # specification, so do not depend on the platform's default encoding.
    with open('data/coqa/coqa-train-v1.0.json', encoding='utf-8') as f:
        myjson = json.load(f)['data']
    return self.load_doc(myjson)
# Not implemented: does nothing and returns None, in line with
# has_validation_docs() returning False.
def validation_docs(self):
pass
def test_docs(self):
pass
myjson = json.load(open('data/coqa/coqa-dev-v1.0.json'))['data']
return self.load_doc(myjson)
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
......@@ -22,8 +32,22 @@ class CoQA(Dataset):
# Placeholder: no few-shot task description is defined yet; returns None.
def fewshot_description(self):
pass
def load_doc(self, myjson):
    """Flatten raw CoQA records into lists of text lines.

    Args:
        myjson: iterable of record dicts. Each record must have a
            ``'story'`` string plus ``'questions'`` and ``'answers'``
            sequences whose elements are dicts with an ``'input_text'``
            string.

    Returns:
        A list with one entry per record. Each entry is a list of strings:
        the story, then for every question/answer pair a ``'\\n'``
        separator, ``'Q: <question>'`` and ``'A: <answer>'``.

    Raises:
        KeyError: if a record or a question/answer dict lacks a required key.
        TypeError: if an ``'input_text'`` value is not a string.
    """
    docs = []
    for item in myjson:
        new_instance = [item['story']]
        # zip() pairs each question with the answer at the same position and
        # silently stops at the shorter of the two sequences.
        for question, answer in zip(item['questions'], item['answers']):
            new_instance.extend((
                '\n',
                'Q: ' + question['input_text'],
                'A: ' + answer['input_text'],
            ))
        docs.append(new_instance)
    return docs
def doc_to_text(self, doc, include_target=True):
    """Render loaded CoQA instances as one ``<|endoftext|>``-delimited string.

    Args:
        doc: iterable of instances, each an iterable of strings (the shape
            built by ``load_doc``).
        include_target: accepted for interface compatibility; not used by
            this implementation.

    Returns:
        Each instance's strings joined with newlines, instances separated
        by ``'\\n<|endoftext|>\\n'``, and a final ``'\\n<|endoftext|>'``
        appended. An empty ``doc`` yields just ``'\\n<|endoftext|>'``.
    """
    # ``doc`` is already-parsed data, not a file path, so nothing is opened
    # or JSON-decoded here.
    text = '\n<|endoftext|>\n'.join('\n'.join(instance) for instance in doc)
    return text + '\n<|endoftext|>'
def evaluate(self, docs, lm, provide_description=False, num_fewshot=0):
    """Placeholder for CoQA evaluation; currently does nothing.

    The two trailing parameters mirror the abstract ``Dataset.evaluate``
    signature, so a caller that passes all four arguments does not fail
    with a ``TypeError``. They have defaults so existing two-argument
    calls keep working.

    Args:
        docs: documents to evaluate on (unused for now).
        lm: language model under evaluation (unused for now).
        provide_description: unused for now.
        num_fewshot: unused for now.

    Returns:
        None, because evaluation is not implemented yet.
    """
    pass
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment