Unverified commit f161731c authored by sdtblck, committed by GitHub

Merge branch 'master' into add_lambada

parents 5a6c172e 43978e3b

@@ -3,4 +3,12 @@
 #coqa
 mkdir -p data/coqa
 wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json -O data/coqa/coqa-train-v1.0.json
 wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json -O data/coqa/coqa-dev-v1.0.json
\ No newline at end of file
+
+#drop
+mkdir -p data/drop
+wget https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip -O data/drop.zip
+unzip data/drop.zip -d data/drop
+rm data/drop.zip
+mv data/drop/drop_dataset/* data/drop
+rm -rf data/drop/drop_dataset

@@ -54,6 +54,12 @@ class LM(abc.ABC):

 class Dataset(abc.ABC):
+    @abc.abstractmethod
+    def download(self):
+        """Downloads the task dataset if necessary"""
+        pass
+
     @abc.abstractmethod
     def has_training_docs(self):
         """Whether the task has a training set"""
@@ -121,4 +127,4 @@ class Dataset(abc.ABC):
             map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
         ) + "\n\n"
         example = self.doc_to_text(doc, include_target=False).strip()
         return description + labeled_examples + example
\ No newline at end of file

@@ -17,19 +17,22 @@ class GPT2LM(LM):
         return cls(device=args.get("device", "cpu"))

     def generate(self, context, max_gen_length, truncate=True):
-        context_tensor = torch.tensor([self.tokenizer.encode(context.strip())], dtype=torch.long).to(self.device)
+        # when too long to fit in context, truncate from the left
+        context_tensor = torch.tensor([self.tokenizer.encode(context.strip())[max_gen_length - 1024:]], dtype=torch.long).to(self.device)
         res = self.gpt2.generate(
             context_tensor,
+            # TODO: change to have until rather than using eos_token_id
             eos_token_id=self.tokenizer.eos_token_id,
             do_sample=False,
             max_length=self.num_tokens(context) + max_gen_length,
         )

         # chop off the prompt and the final eos token
-        return self.tokenizer.decode(res[0][len(context[0]):-1]).strip()
+        return self.tokenizer.decode(res[0][min(1024 - max_gen_length, len(context_tensor[0])):-1]).strip()

     def loglikelihood(self, context, continuation, truncate=True):
-        inp = torch.tensor([self.tokenizer.encode(context + continuation)], dtype=torch.long).to(self.device)
+        # when too long to fit in context, truncate from the left
+        inp = torch.tensor([self.tokenizer.encode(context + continuation)[-1024:]], dtype=torch.long).to(self.device)
         ctxlen = len(self.tokenizer.encode(context.strip()))

         cont_toks = inp[:, ctxlen:]  # [batch, seq]
...
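
The left-truncation arithmetic above is easy to misread, so here is a minimal, self-contained sketch of what the slice does, assuming GPT-2's 1024-token window; `truncate_left` is a hypothetical helper for illustration only, not part of the diff.

# Illustration of encode(...)[max_gen_length - 1024:] under a 1024-token window:
# the slice start is negative, so it keeps only the last (1024 - max_gen_length)
# context tokens, leaving room for max_gen_length generated tokens.

def truncate_left(context_tokens, max_gen_length, window=1024):
    # max_gen_length - window is negative, i.e. "keep the last N tokens"
    return context_tokens[max_gen_length - window:]

tokens = list(range(2000))          # stand-in for a long encoded prompt
kept = truncate_left(tokens, max_gen_length=256)
assert len(kept) == 1024 - 256      # 768 context tokens survive
assert kept[-1] == tokens[-1]       # truncation happened on the left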

@@ -2,10 +2,8 @@ import os

 import transformers
 from lm_eval.base import LM
 from lm_eval import utils
-from . import MODEL_REGISTRY

-@MODEL_REGISTRY.register("gpt3")
 class GPT3LM(LM):

     MAX_LENGTH = 2048
...

 from . import superglue
 from . import glue
+from . import arc
+from . import race
+from . import webqs

 TASK_REGISTRY = {
+    # GLUE
     "cola": glue.CoLA,
     "mnli": glue.MNLI,
     "mrpc": glue.MRPC,
@@ -11,11 +15,18 @@ TASK_REGISTRY = {
     "stsb": glue.STSB,
     "sst": glue.SST,
     "wnli": glue.WNLI,
+    # SuperGLUE
     "boolq": superglue.BoolQ,
     "commitmentbank": superglue.CommitmentBank,
     "copa": superglue.Copa,
+    "multirc": superglue.MultiRC,
     "wic": superglue.WordsInContext,
     "wsc": superglue.WinogradSchemaChallenge,
+    # Order by benchmark/genre?
+    "arc_easy": arc.ARCEasy,
+    "arc_challenge": arc.ARCChallenge,
+    "race": race.RACE,
+    "webqs": webqs.WebQs,
 }
...

from . common import HFNLPTask


class ARCEasy(HFNLPTask):
    NLP_PATH = "ai2_arc"
    NLP_NAME = "ARC-Easy"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        q = "Question: " + doc['question'] + '\n'
        a = "Answer:" + ((" " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]) if include_target else "")
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()


class ARCChallenge(ARCEasy):
    NLP_PATH = "ai2_arc"
    NLP_NAME = "ARC-Challenge"
\ No newline at end of file
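
For orientation, a hedged sketch of what `ARCEasy.doc_to_text` yields for a hypothetical ai2_arc-style record; the field layout is assumed from the indexing in the method, and the record contents are made up.

# Hypothetical ai2_arc-style record matching the fields doc_to_text reads.
doc = {
    'question': 'Which gas do plants absorb from the atmosphere?',
    'choices': {'text': ['Oxygen', 'Carbon dioxide', 'Nitrogen', 'Helium'],
                'label': ['A', 'B', 'C', 'D']},
    'answerKey': 'B',
}
# doc_to_text(doc, include_target=True) would then return:
#   Question: Which gas do plants absorb from the atmosphere?
#   Answer: Carbon dioxide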

import numpy as np
import json
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFNLPTask, simple_accuracy_metric, yesno
from pathlib import Path
from ..base import Dataset


class DROP(Dataset):
    DATAFOLDER = Path(__file__).parent / "../../data/drop"

    def has_training_docs(self):
        """Whether the task has a training set"""
        return True

    def has_validation_docs(self):
        """Whether the task has a validation set"""
        return True

    def has_test_docs(self):
        """Whether the task has a test set"""
        return False

    def training_docs(self):
        docs = json.load(open(self.DATAFOLDER / 'drop_dataset_train.json'))
        return [docs[k] for k in docs.keys()]

    def validation_docs(self):
        docs = json.load(open(self.DATAFOLDER / 'drop_dataset_dev.json'))
        return [docs[k] for k in docs.keys()]

    def test_docs(self):
        pass

    def doc_to_text(self, doc, include_target=True):
        doctext = "Passage: {}\n\n".format(doc["passage"])
        qa_texts = []
        for pair in doc["qa_pairs"]:
            text = ''.join(['Q: ', pair['question'], '\nA: '])
            if include_target:
                def get_answer(ans_dict):
                    if ans_dict['number'] != '':
                        return ans_dict['number']
                    if ans_dict['spans'] != []:
                        if len(ans_dict['spans']) > 0:
                            return ', '.join(ans_dict['spans'])
                        return ans_dict['spans'][0]
                    return ' '.join([ans_dict['date']['day'],
                                     ans_dict['date']['month'],
                                     ans_dict['date']['year']]).strip()
                text = ''.join([text, get_answer(pair['answer'])])
            qa_texts.append(text)
        return ''.join([doctext, '\n\n'.join(qa_texts)])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Take iterable of docs and evaluates, returning a dict with the following format:

        {
            "major": float,
            "minor": dict,
            "higher_is_better": bool,
        }

        * `major` should be a single, representative number, for programmatic comparison
        * `minor` should be a dictionary containing all relevant sub-metrics
        * `higher_is_better` determines whether a higher metric is better
        """
        pass

    def fewshot_description(self):
        return "Read the passage and answer the questions "
...@@ -4,7 +4,6 @@ from sklearn.metrics import f1_score, matthews_corrcoef ...@@ -4,7 +4,6 @@ from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno from . common import HFTask, simple_accuracy_metric, yesno
def get_accuracy_and_f1(preds, golds): def get_accuracy_and_f1(preds, golds):
golds = np.array(golds) golds = np.array(golds)
preds = np.array(preds) preds = np.array(preds)
...@@ -25,6 +24,15 @@ def get_accuracy_and_f1(preds, golds): ...@@ -25,6 +24,15 @@ def get_accuracy_and_f1(preds, golds):
class CoLA(HFTask): class CoLA(HFTask):
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "cola" DATASET_NAME = "cola"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self): def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?" return "Does this sentence make sense?:\tTrue or False?"
...@@ -143,7 +151,7 @@ class MRPC(HFTask): ...@@ -143,7 +151,7 @@ class MRPC(HFTask):
preds.append(lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no')) preds.append(lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no'))
return get_accuracy_and_f1(preds=preds, golds=golds) return get_accuracy_and_f1(preds=preds, golds=golds)
class RTE(HFTask): class RTE(HFTask):
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "rte" DATASET_NAME = "rte"
...@@ -353,7 +361,7 @@ class SST(HFTask): ...@@ -353,7 +361,7 @@ class SST(HFTask):
class WNLI(HFTask): class WNLI(HFTask):
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "wnli" DATASET_NAME = "wnli"
def has_training_docs(self): def has_training_docs(self):
return True return True
......

from . common import HFNLPTask
from ..utils_stream import X, each, apply, join, filt, one
import collections
import nlp


class RACE(HFNLPTask):
    NLP_PATH = "race"
    NLP_NAME = "high"

    cache = {}

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _collate_data(self, set):
        if set in self.cache: return self.cache[set]
        # One big issue with HF's implementation of this dataset: it makes a
        # separate document for each question; meanwhile, in the GPT3 paper it
        # is shown that one document is made per passage.
        r = collections.defaultdict(list)
        for item in nlp.load_dataset(path=self.NLP_PATH, name=self.NLP_NAME)[set]:
            r[item['article']].append(item)

        res = list(r.values() >> each(lambda x: {
            'article': x[0]['article'],
            'problems': x >> each(lambda y: {
                'question': y['question'],
                'answer': y['answer'],
                'options': y['options'],
            })
        }))

        self.cache[set] = res
        return res

    def training_docs(self):
        return self._collate_data("train")

    def validation_docs(self):
        return self._collate_data("validation")

    def test_docs(self):
        return self._collate_data("test")

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        r = "Article:\n" + doc['article'] + '\n\n'
        r += doc['problems'] >> apply(enumerate) >> each(
            lambda x: 'Q: ' + x[1]['question'] + '\n\nA:'
            + ((' ' + x[1]['options'][['A', 'B', 'C', 'D'].index(x[1]['answer'])])
               if x[0] != len(doc['problems']) - 1 or include_target else '')) \
            >> join('\n\n')
        return r

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
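
To make the collated document shape concrete, a hedged sketch of one passage-level RACE document as `_collate_data` would build it, and of the prompt `doc_to_text` renders from it; the contents are invented for illustration.

# Hypothetical collated RACE document (shape produced by _collate_data above).
doc = {
    'article': 'Some passage text ...',
    'problems': [
        {'question': 'First question?',  'answer': 'A', 'options': ['yes', 'no', 'maybe', 'unsure']},
        {'question': 'Second question?', 'answer': 'C', 'options': ['red', 'green', 'blue', 'black']},
    ],
}
# doc_to_text(doc, include_target=False) fills in answers for every question
# except the last, which is left as a bare "A:" for the model to complete:
#   Article:
#   Some passage text ...
#
#   Q: First question?
#
#   A: yes
#
#   Q: Second question?
#
#   A: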

 import numpy as np
 from tqdm import auto as tqdm_lib

-from . common import NLP_TASK, simple_accuracy_metric, yesno
+from . common import HFNLPTask, simple_accuracy_metric, yesno

-class BoolQ(NLP_TASK):
+class BoolQ(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "boolq"
@@ -36,7 +36,7 @@ class BoolQ(NLP_TASK):
         return simple_accuracy_metric(preds=preds, golds=golds)

-class CommitmentBank(NLP_TASK):
+class CommitmentBank(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "cb"
@@ -79,7 +79,7 @@ class CommitmentBank(NLP_TASK):
         return simple_accuracy_metric(preds=preds, golds=golds)

-class Copa(NLP_TASK):
+class Copa(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "copa"
@@ -120,7 +120,64 @@ class Copa(NLP_TASK):
         return choice[0].lower() + choice[1:]

-class WordsInContext(NLP_TASK):
+class MultiRC(HFNLPTask):
+    NLP_PATH = "super_glue"
+    NLP_NAME = "multirc"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "READING COMPREHENSION ANSWER KEY"
+
+    def doc_to_text(self, doc, include_target=True):
+        return f"{doc['paragraph']}\n\n{doc['question']}\n" \
+            + (self.format_answer(answer=doc["answer"], label=doc["label"])
+               if include_target else "")
+
+    @staticmethod
+    def format_answer(answer, label):
+        label_str = "True" if label else "False"
+        return f"[{label_str}] {answer}"
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        preds = []
+        for doc in docs:
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            true_choice = self.format_answer(answer=doc["answer"], label=True)
+            false_choice = self.format_answer(answer=doc["answer"], label=False)
+            preds.append(
+                lm.loglikelihood(ctx, f' {true_choice}')
+                > lm.loglikelihood(ctx, f' {false_choice}')
+            )
+
+        # Only count as correct if all answers are labeled correctly for each question
+        question_scoring_dict = {}
+        for doc, pred in zip(docs, preds):
+            question_id = doc["idx"]["question"]
+            if question_id not in question_scoring_dict:
+                question_scoring_dict[question_id] = []
+            gold_label = doc["label"] == 1
+            question_scoring_dict[question_id].append(gold_label == pred)
+        acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+
+        return {
+            "major": acc,
+            "minor": {"acc": acc},
+            "higher_is_better": True,
+        }
+
+
+class WordsInContext(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "wic"
@@ -157,7 +214,7 @@ class WordsInContext(NLP_TASK):
         return simple_accuracy_metric(preds=preds, golds=golds)

-class WinogradSchemaChallenge(NLP_TASK):
+class WinogradSchemaChallenge(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "wsc"
...
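
A minimal sketch of MultiRC's per-question aggregation used in `evaluate` above: a question only counts as correct when every one of its candidate answers is labeled correctly. The results below are hypothetical.

import numpy as np

# Hypothetical per-answer results: (question_id, prediction_was_correct)
results = [(0, True), (0, True), (0, False),   # question 0: one answer missed
           (1, True), (1, True)]               # question 1: fully correct

# Group by question; a question scores 1 only if all of its answers were right,
# the same rule as in evaluate() above.
by_question = {}
for qid, correct in results:
    by_question.setdefault(qid, []).append(correct)

acc = np.mean([int(all(v)) for v in by_question.values()])
assert acc == 0.5   # question 0 fails, question 1 passes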

from . common import HFNLPTask


class WebQs(HFNLPTask):
    NLP_PATH = "web_questions"
    NLP_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        print(doc)
        q = "Q: " + doc['question'] + '\n'

        # this picks one answer to be the "correct" one, despite sometimes
        # multiple correct answers being possible.
        # TODO: make sure we're actually handling multi-answer correctly
        a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file

import os
from functools import reduce
import operator
import lm_dataformat as lmd
from tqdm import tqdm
import json


class ExitCodeError(Exception): pass


def sh(x):
    if os.system(x): raise ExitCodeError()


def ls(x):
    return [x + '/' + fn for fn in os.listdir(x)]


def lsr(x):
    if os.path.isdir(x):
        return reduce(operator.add, map(lsr, ls(x)), [])
    else:
        return [x]


def fwrite(fname, content):
    with open(fname, 'w') as fh:
        fh.write(content)


def fread(fname):
    with open(fname) as fh:
        return fh.read()


class each:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class filt:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(filter(self.f, other))


class apply:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return self.f(other)


class one:
    def __rrshift__(self, other):
        try:
            if isinstance(other, list):
                assert len(other) == 1
                return other[0]
            return next(other)
        except:
            return None


class join:
    def __init__(self, sep):
        self.sep = sep

    def __rrshift__(self, other):
        if other is None: return
        try:
            return self.sep.join(other)
        except:
            return None


Y = object()


def id(x):
    return x


class Reflective:
    def __getattribute__(self, f):
        def _fn(*args, **kwargs):
            return lambda x: x.__getattribute__(f)(*args, **kwargs)
        return _fn

    def __getitem__(self, a):
        return lambda x: x[a]

    def __mul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x * y
            return _f
        return lambda x: x * other

    def __rmul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y * x
            return _f
        return lambda x: other * x

    def __add__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x + y
            return _f
        return lambda x: x + other

    def __radd__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y + x
            return _f
        return lambda x: other + x


# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
    curr = init
    for elem in arr:
        curr = f(curr, elem)
    return curr


# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
    curr = init
    for elem in arr[::-1]:
        curr = f(elem, curr)
    return curr


def comp(*fs):
    if len(fs) == 1:
        return fs[0]

    def _f(x):
        for f in fs[::-1]:
            x = f(x)
        return x
    return _f


X = Reflective()
\ No newline at end of file
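
A short usage sketch of the `>>` pipeline helpers defined above (the names race.py imports via `from ..utils_stream import X, each, apply, join, filt, one`); the data is illustrative and assumes those definitions are in scope.

# each/filt/one/join hook __rrshift__, so a plain sequence can be piped with >>.
words = ['alpha', 'beta', 'gamma', 'delta']

assert (words >> each(len)) == [5, 4, 5, 5]
assert (words >> filt(lambda w: len(w) == 4)) == ['beta']
assert (words >> filt(lambda w: len(w) == 4) >> one()) == 'beta'
assert (words >> join(', ')) == 'alpha, beta, gamma, delta'

# apply runs a function over the whole sequence rather than element-wise.
assert (words >> apply(sorted) >> join(' ')) == 'alpha beta delta gamma'

# X builds small lambdas: X['key'] indexes, X.method(...) calls a method.
rows = [{'name': 'ada'}, {'name': 'bob'}]
assert (rows >> each(X['name']) >> each(X.upper()) >> join('/')) == 'ADA/BOB'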
...@@ -32,8 +32,9 @@ def main(): ...@@ -32,8 +32,9 @@ def main():
os.makedirs(args.output_base_path, exist_ok=True) os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items(): for task_name, task in task_dict.items():
if not task.has_validation_docs(): if not task.has_validation_docs():
continue docs = task.training_docs()
docs = task.validation_docs() else:
docs = task.validation_docs()
with open(os.path.join(args.output_base_path, task_name), "w") as f: with open(os.path.join(args.output_base_path, task_name), "w") as f:
for i, doc in zip(range(args.num_examples), docs): for i, doc in zip(range(args.num_examples), docs):
f.write(EXAMPLE_DIVIDER.format(i=i)) f.write(EXAMPLE_DIVIDER.format(i=i))
......