Unverified commit f161731c authored by sdtblck, committed by GitHub

Merge branch 'master' into add_lambada

parents 5a6c172e 43978e3b

@@ -3,4 +3,12 @@
 #coqa
 mkdir -p data/coqa
 wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json -O data/coqa/coqa-train-v1.0.json
 wget http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json -O data/coqa/coqa-dev-v1.0.json
\ No newline at end of file
+
+#drop
+mkdir -p data/drop
+wget https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip -O data/drop.zip
+unzip data/drop.zip -d data/drop
+rm data/drop.zip
+mv data/drop/drop_dataset/* data/drop
+rm -rf data/drop/drop_dataset

@@ -54,6 +54,12 @@ class LM(abc.ABC):

 class Dataset(abc.ABC):
+    @abc.abstractmethod
+    def download(self):
+        """Downloads the task dataset if necessary"""
+        pass
+
     @abc.abstractmethod
     def has_training_docs(self):
         """Whether the task has a training set"""
@@ -121,4 +127,4 @@ class Dataset(abc.ABC):
             map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
         ) + "\n\n"
         example = self.doc_to_text(doc, include_target=False).strip()
         return description + labeled_examples + example
\ No newline at end of file

@@ -17,19 +17,22 @@ class GPT2LM(LM):
         return cls(device=args.get("device", "cpu"))

     def generate(self, context, max_gen_length, truncate=True):
-        context_tensor = torch.tensor([self.tokenizer.encode(context.strip())], dtype=torch.long).to(self.device)
+        # when too long to fit in context, truncate from the left
+        context_tensor = torch.tensor([self.tokenizer.encode(context.strip())[max_gen_length - 1024:]], dtype=torch.long).to(self.device)
         res = self.gpt2.generate(
             context_tensor,
+            # TODO: change to have until rather than using eos_token_id
             eos_token_id=self.tokenizer.eos_token_id,
             do_sample=False,
             max_length=self.num_tokens(context) + max_gen_length,
         )

         # chop off the prompt and the final eos token
-        return self.tokenizer.decode(res[0][len(context[0]):-1]).strip()
+        return self.tokenizer.decode(res[0][min(1024 - max_gen_length, len(context_tensor[0])):-1]).strip()

     def loglikelihood(self, context, continuation, truncate=True):
-        inp = torch.tensor([self.tokenizer.encode(context + continuation)], dtype=torch.long).to(self.device)
+        # when too long to fit in context, truncate from the left
+        inp = torch.tensor([self.tokenizer.encode(context + continuation)[-1024:]], dtype=torch.long).to(self.device)
         ctxlen = len(self.tokenizer.encode(context.strip()))

         cont_toks = inp[:, ctxlen:]  # [batch, seq]
...
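
The left-truncation arithmetic above is easy to misread, so here is a minimal, self-contained sketch of what the slice does, assuming GPT-2's 1024-token window; `truncate_left` is a hypothetical helper for illustration only, not part of the diff.

# Illustration of encode(...)[max_gen_length - 1024:] under a 1024-token window:
# the slice start is negative, so it keeps only the last (1024 - max_gen_length)
# context tokens, leaving room for max_gen_length generated tokens.

def truncate_left(context_tokens, max_gen_length, window=1024):
    # max_gen_length - window is negative, i.e. "keep the last N tokens"
    return context_tokens[max_gen_length - window:]

tokens = list(range(2000))          # stand-in for a long encoded prompt
kept = truncate_left(tokens, max_gen_length=256)
assert len(kept) == 1024 - 256      # 768 context tokens survive
assert kept[-1] == tokens[-1]       # truncation happened on the left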

@@ -2,10 +2,8 @@ import os

 import transformers
 from lm_eval.base import LM
 from lm_eval import utils
-from . import MODEL_REGISTRY

-@MODEL_REGISTRY.register("gpt3")
 class GPT3LM(LM):

     MAX_LENGTH = 2048
...

 from . import superglue
 from . import glue
+from . import arc
+from . import race
+from . import webqs

 TASK_REGISTRY = {
+    # GLUE
     "cola": glue.CoLA,
     "mnli": glue.MNLI,
     "mrpc": glue.MRPC,
@@ -11,11 +15,18 @@ TASK_REGISTRY = {
     "stsb": glue.STSB,
     "sst": glue.SST,
     "wnli": glue.WNLI,
+    # SuperGLUE
     "boolq": superglue.BoolQ,
     "commitmentbank": superglue.CommitmentBank,
     "copa": superglue.Copa,
+    "multirc": superglue.MultiRC,
     "wic": superglue.WordsInContext,
     "wsc": superglue.WinogradSchemaChallenge,
+    # Order by benchmark/genre?
+    "arc_easy": arc.ARCEasy,
+    "arc_challenge": arc.ARCChallenge,
+    "race": race.RACE,
+    "webqs": webqs.WebQs,
 }
...

from . common import HFNLPTask


class ARCEasy(HFNLPTask):
    NLP_PATH = "ai2_arc"
    NLP_NAME = "ARC-Easy"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        q = "Question: " + doc['question'] + '\n'
        a = "Answer:" + ((" " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]) if include_target else "")
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()


class ARCChallenge(ARCEasy):
    NLP_PATH = "ai2_arc"
    NLP_NAME = "ARC-Challenge"
\ No newline at end of file
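
For orientation, a hedged sketch of what `ARCEasy.doc_to_text` yields for a hypothetical ai2_arc-style record; the field layout is assumed from the indexing in the method, and the record contents are made up.

# Hypothetical ai2_arc-style record matching the fields doc_to_text reads.
doc = {
    'question': 'Which gas do plants absorb from the atmosphere?',
    'choices': {'text': ['Oxygen', 'Carbon dioxide', 'Nitrogen', 'Helium'],
                'label': ['A', 'B', 'C', 'D']},
    'answerKey': 'B',
}
# doc_to_text(doc, include_target=True) would then return:
#   Question: Which gas do plants absorb from the atmosphere?
#   Answer: Carbon dioxide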

import numpy as np
import json
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFNLPTask, simple_accuracy_metric, yesno
from pathlib import Path
from ..base import Dataset


class DROP(Dataset):
    DATAFOLDER = Path(__file__).parent / "../../data/drop"

    def has_training_docs(self):
        """Whether the task has a training set"""
        return True

    def has_validation_docs(self):
        """Whether the task has a validation set"""
        return True

    def has_test_docs(self):
        """Whether the task has a test set"""
        return False

    def training_docs(self):
        docs = json.load(open(self.DATAFOLDER / 'drop_dataset_train.json'))
        return [docs[k] for k in docs.keys()]

    def validation_docs(self):
        docs = json.load(open(self.DATAFOLDER / 'drop_dataset_dev.json'))
        return [docs[k] for k in docs.keys()]

    def test_docs(self):
        pass

    def doc_to_text(self, doc, include_target=True):
        doctext = "Passage: {}\n\n".format(doc["passage"])
        qa_texts = []
        for pair in doc["qa_pairs"]:
            text = ''.join(['Q: ', pair['question'], '\nA: '])
            if include_target:
                def get_answer(ans_dict):
                    if ans_dict['number'] != '':
                        return ans_dict['number']
                    if ans_dict['spans'] != []:
                        if len(ans_dict['spans']) > 0:
                            return ', '.join(ans_dict['spans'])
                        return ans_dict['spans'][0]
                    return ' '.join([ans_dict['date']['day'],
                                     ans_dict['date']['month'],
                                     ans_dict['date']['year']]).strip()
                text = ''.join([text, get_answer(pair['answer'])])
            qa_texts.append(text)
        return ''.join([doctext, '\n\n'.join(qa_texts)])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        """Take iterable of docs and evaluates, returning a dict with the following format:

        {
            "major": float,
            "minor": dict,
            "higher_is_better": bool,
        }

        * `major` should be a single, representative number, for programmatic comparison
        * `minor` should be a dictionary containing all relevant sub-metrics
        * `higher_is_better` determines whether a higher metric is better
        """
        pass

    def fewshot_description(self):
        return "Read the passage and answer the questions "
...@@ -4,7 +4,6 @@ from sklearn.metrics import f1_score, matthews_corrcoef ...@@ -4,7 +4,6 @@ from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno from . common import HFTask, simple_accuracy_metric, yesno
def get_accuracy_and_f1(preds, golds): def get_accuracy_and_f1(preds, golds):
golds = np.array(golds) golds = np.array(golds)
preds = np.array(preds) preds = np.array(preds)
...@@ -25,6 +24,15 @@ def get_accuracy_and_f1(preds, golds): ...@@ -25,6 +24,15 @@ def get_accuracy_and_f1(preds, golds):
class CoLA(HFTask): class CoLA(HFTask):
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "cola" DATASET_NAME = "cola"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self): def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?" return "Does this sentence make sense?:\tTrue or False?"
...@@ -143,7 +151,7 @@ class MRPC(HFTask): ...@@ -143,7 +151,7 @@ class MRPC(HFTask):
preds.append(lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no')) preds.append(lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no'))
return get_accuracy_and_f1(preds=preds, golds=golds) return get_accuracy_and_f1(preds=preds, golds=golds)
class RTE(HFTask): class RTE(HFTask):
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "rte" DATASET_NAME = "rte"
...@@ -353,7 +361,7 @@ class SST(HFTask): ...@@ -353,7 +361,7 @@ class SST(HFTask):
class WNLI(HFTask): class WNLI(HFTask):
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "wnli" DATASET_NAME = "wnli"
def has_training_docs(self): def has_training_docs(self):
return True return True
......

from . common import HFNLPTask
from ..utils_stream import X, each, apply, join, filt, one
import collections
import nlp


class RACE(HFNLPTask):
    NLP_PATH = "race"
    NLP_NAME = "high"

    cache = {}

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def _collate_data(self, set):
        if set in self.cache: return self.cache[set]
        # One big issue with HF's implementation of this dataset: it makes a
        # separate document for each question; meanwhile, in the GPT3 paper it
        # is shown that one document is made per passage.
        r = collections.defaultdict(list)
        for item in nlp.load_dataset(path=self.NLP_PATH, name=self.NLP_NAME)[set]:
            r[item['article']].append(item)

        res = list(r.values() >> each(lambda x: {
            'article': x[0]['article'],
            'problems': x >> each(lambda y: {
                'question': y['question'],
                'answer': y['answer'],
                'options': y['options'],
            })
        }))

        self.cache[set] = res
        return res

    def training_docs(self):
        return self._collate_data("train")

    def validation_docs(self):
        return self._collate_data("validation")

    def test_docs(self):
        return self._collate_data("test")

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        r = "Article:\n" + doc['article'] + '\n\n'
        r += doc['problems'] >> apply(enumerate) >> each(
            lambda x: 'Q: ' + x[1]['question'] + '\n\nA:'
            + ((' ' + x[1]['options'][['A', 'B', 'C', 'D'].index(x[1]['answer'])])
               if x[0] != len(doc['problems']) - 1 or include_target else '')) \
            >> join('\n\n')
        return r

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file
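
To make the collated document shape concrete, a hedged sketch of one passage-level RACE document as `_collate_data` would build it, and of the prompt `doc_to_text` renders from it; the contents are invented for illustration.

# Hypothetical collated RACE document (shape produced by _collate_data above).
doc = {
    'article': 'Some passage text ...',
    'problems': [
        {'question': 'First question?',  'answer': 'A', 'options': ['yes', 'no', 'maybe', 'unsure']},
        {'question': 'Second question?', 'answer': 'C', 'options': ['red', 'green', 'blue', 'black']},
    ],
}
# doc_to_text(doc, include_target=False) fills in answers for every question
# except the last, which is left as a bare "A:" for the model to complete:
#   Article:
#   Some passage text ...
#
#   Q: First question?
#
#   A: yes
#
#   Q: Second question?
#
#   A: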

 import numpy as np
 from tqdm import auto as tqdm_lib

-from . common import NLP_TASK, simple_accuracy_metric, yesno
+from . common import HFNLPTask, simple_accuracy_metric, yesno

-class BoolQ(NLP_TASK):
+class BoolQ(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "boolq"
@@ -36,7 +36,7 @@ class BoolQ(NLP_TASK):
         return simple_accuracy_metric(preds=preds, golds=golds)

-class CommitmentBank(NLP_TASK):
+class CommitmentBank(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "cb"
@@ -79,7 +79,7 @@ class CommitmentBank(NLP_TASK):
         return simple_accuracy_metric(preds=preds, golds=golds)

-class Copa(NLP_TASK):
+class Copa(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "copa"
@@ -120,7 +120,64 @@ class Copa(NLP_TASK):
         return choice[0].lower() + choice[1:]

-class WordsInContext(NLP_TASK):
+class MultiRC(HFNLPTask):
+    NLP_PATH = "super_glue"
+    NLP_NAME = "multirc"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "READING COMPREHENSION ANSWER KEY"
+
+    def doc_to_text(self, doc, include_target=True):
+        return f"{doc['paragraph']}\n\n{doc['question']}\n" \
+            + (self.format_answer(answer=doc["answer"], label=doc["label"])
+               if include_target else "")
+
+    @staticmethod
+    def format_answer(answer, label):
+        label_str = "True" if label else "False"
+        return f"[{label_str}] {answer}"
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        preds = []
+        for doc in docs:
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            true_choice = self.format_answer(answer=doc["answer"], label=True)
+            false_choice = self.format_answer(answer=doc["answer"], label=False)
+            preds.append(
+                lm.loglikelihood(ctx, f' {true_choice}')
+                > lm.loglikelihood(ctx, f' {false_choice}')
+            )
+
+        # Only count as correct if all answers are labeled correctly for each question
+        question_scoring_dict = {}
+        for doc, pred in zip(docs, preds):
+            question_id = doc["idx"]["question"]
+            if question_id not in question_scoring_dict:
+                question_scoring_dict[question_id] = []
+            gold_label = doc["label"] == 1
+            question_scoring_dict[question_id].append(gold_label == pred)
+        acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
+
+        return {
+            "major": acc,
+            "minor": {"acc": acc},
+            "higher_is_better": True,
+        }
+
+
+class WordsInContext(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "wic"
@@ -157,7 +214,7 @@ class WordsInContext(NLP_TASK):
         return simple_accuracy_metric(preds=preds, golds=golds)

-class WinogradSchemaChallenge(NLP_TASK):
+class WinogradSchemaChallenge(HFNLPTask):
     NLP_PATH = "super_glue"
     NLP_NAME = "wsc"
...
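
A minimal sketch of MultiRC's per-question aggregation used in `evaluate` above: a question only counts as correct when every one of its candidate answers is labeled correctly. The results below are hypothetical.

import numpy as np

# Hypothetical per-answer results: (question_id, prediction_was_correct)
results = [(0, True), (0, True), (0, False),   # question 0: one answer missed
           (1, True), (1, True)]               # question 1: fully correct

# Group by question; a question scores 1 only if all of its answers were right,
# the same rule as in evaluate() above.
by_question = {}
for qid, correct in results:
    by_question.setdefault(qid, []).append(correct)

acc = np.mean([int(all(v)) for v in by_question.values()])
assert acc == 0.5   # question 0 fails, question 1 passes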

from . common import HFNLPTask


class WebQs(HFNLPTask):
    NLP_PATH = "web_questions"
    NLP_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc, include_target=True):
        print(doc)
        q = "Q: " + doc['question'] + '\n'

        # this picks one answer to be the "correct" one, despite sometimes
        # multiple correct answers being possible.
        # TODO: make sure we're actually handling multi-answer correctly
        a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
        return q + a

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: implement
        raise NotImplementedError()
\ No newline at end of file

import os
from functools import reduce
import operator
import lm_dataformat as lmd
from tqdm import tqdm
import json


class ExitCodeError(Exception): pass


def sh(x):
    if os.system(x): raise ExitCodeError()


def ls(x):
    return [x + '/' + fn for fn in os.listdir(x)]


def lsr(x):
    if os.path.isdir(x):
        return reduce(operator.add, map(lsr, ls(x)), [])
    else:
        return [x]


def fwrite(fname, content):
    with open(fname, 'w') as fh:
        fh.write(content)


def fread(fname):
    with open(fname) as fh:
        return fh.read()


class each:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(map(self.f, other))


class filt:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return list(filter(self.f, other))


class apply:
    def __init__(self, f):
        self.f = f

    def __rrshift__(self, other):
        return self.f(other)


class one:
    def __rrshift__(self, other):
        try:
            if isinstance(other, list):
                assert len(other) == 1
                return other[0]
            return next(other)
        except:
            return None


class join:
    def __init__(self, sep):
        self.sep = sep

    def __rrshift__(self, other):
        if other is None: return
        try:
            return self.sep.join(other)
        except:
            return None


Y = object()


def id(x):
    return x


class Reflective:
    def __getattribute__(self, f):
        def _fn(*args, **kwargs):
            return lambda x: x.__getattribute__(f)(*args, **kwargs)
        return _fn

    def __getitem__(self, a):
        return lambda x: x[a]

    def __mul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x * y
            return _f
        return lambda x: x * other

    def __rmul__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y * x
            return _f
        return lambda x: other * x

    def __add__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return x + y
            return _f
        return lambda x: x + other

    def __radd__(self, other):
        if other == Y:
            def _f(x, y=None):
                if y == None:
                    x, y = x
                return y + x
            return _f
        return lambda x: other + x


# (b -> a -> b) -> b -> [a] -> b
def foldl(f, init, arr):
    curr = init
    for elem in arr:
        curr = f(curr, elem)
    return curr


# (a -> b -> b) -> b -> [a] -> b
def foldr(f, init, arr):
    curr = init
    for elem in arr[::-1]:
        curr = f(elem, curr)
    return curr


def comp(*fs):
    if len(fs) == 1:
        return fs[0]

    def _f(x):
        for f in fs[::-1]:
            x = f(x)
        return x
    return _f


X = Reflective()
\ No newline at end of file
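
A short usage sketch of the `>>` pipeline helpers defined above (the names race.py imports via `from ..utils_stream import X, each, apply, join, filt, one`); the data is illustrative and assumes those definitions are in scope.

# each/filt/one/join hook __rrshift__, so a plain sequence can be piped with >>.
words = ['alpha', 'beta', 'gamma', 'delta']

assert (words >> each(len)) == [5, 4, 5, 5]
assert (words >> filt(lambda w: len(w) == 4)) == ['beta']
assert (words >> filt(lambda w: len(w) == 4) >> one()) == 'beta'
assert (words >> join(', ')) == 'alpha, beta, gamma, delta'

# apply runs a function over the whole sequence rather than element-wise.
assert (words >> apply(sorted) >> join(' ')) == 'alpha beta delta gamma'

# X builds small lambdas: X['key'] indexes, X.method(...) calls a method.
rows = [{'name': 'ada'}, {'name': 'bob'}]
assert (rows >> each(X['name']) >> each(X.upper()) >> join('/')) == 'ADA/BOB'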
...@@ -32,8 +32,9 @@ def main(): ...@@ -32,8 +32,9 @@ def main():
os.makedirs(args.output_base_path, exist_ok=True) os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items(): for task_name, task in task_dict.items():
if not task.has_validation_docs(): if not task.has_validation_docs():
continue docs = task.training_docs()
docs = task.validation_docs() else:
docs = task.validation_docs()
with open(os.path.join(args.output_base_path, task_name), "w") as f: with open(os.path.join(args.output_base_path, task_name), "w") as f:
for i, doc in zip(range(args.num_examples), docs): for i, doc in zip(range(args.num_examples), docs):
f.write(EXAMPLE_DIVIDER.format(i=i)) f.write(EXAMPLE_DIVIDER.format(i=i))
......