Merge branch 'master' into researcher2

e0cfeb90 · Jonathan Tow · GitHub · f9b81151 · 6caa0afd · e0cfeb90
Unverified Commit e0cfeb90 authored Apr 10, 2022 by Jonathan Tow Committed by GitHub Apr 10, 2022
20 changed files
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -50,6 +50,7 @@ from . import truthfulqa
 from . import blimp
 from . import asdiv
 from . import gsm8k
+from . import storycloze

 ########################################
 # Translation tasks
@@ -136,7 +137,6 @@ TASK_REGISTRY = {
    "logiqa": logiqa.LogiQA,
    "hellaswag": hellaswag.HellaSwag,
    "openbookqa": openbookqa.OpenBookQA,
-    # "sat": sat.SATAnalogies, # not implemented yet
    "squad2": squad.SQuAD2,
    "race": race.RACE,
    # "naturalqs": naturalqs.NaturalQs, # not implemented yet
@@ -297,6 +297,11 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
+
+    # Requires manual download of data.
+    # "storycloze_2016": storycloze.StoryCloze2016,
+    # "storycloze_2018": storycloze.StoryCloze2018,
+    # "sat": sat.SATAnalogies,
 }



--- a/lm_eval/tasks/anli.py
+++ b/lm_eval/tasks/anli.py
-import numpy as np
-from lm_eval.base import rf
-from ..metrics import mean
-from . common import HFTask
+"""
+Adversarial NLI: A New Benchmark for Natural Language Understanding
+https://arxiv.org/pdf/1910.14599.pdf

+Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
+human-and-model-in-the-loop procedure. It consists of three rounds that progressively
+increase in difficulty and complexity, and each question-answer includes annotator-
+provided explanations.

-class ANLIBase(HFTask):
+Homepage: "https://github.com/facebookresearch/anli"
+"""
+import numpy as np
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean
+
+
+_CITATION = """
+@inproceedings{nie-etal-2020-adversarial,
+    title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
+    author = "Nie, Yixin  and
+      Williams, Adina  and
+      Dinan, Emily  and
+      Bansal, Mohit  and
+      Weston, Jason  and
+      Kiela, Douwe",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    year = "2020",
+    publisher = "Association for Computational Linguistics",
+}
+"""
+
+
+class ANLIBase(Task):
    VERSION = 0
    DATASET_PATH = "anli"
    DATASET_NAME = None
@@ -22,16 +48,16 @@ class ANLIBase(HFTask):
    def training_docs(self):
        if self.has_training_docs():
            if self._training_docs is None:
-                self._training_docs = list(self.data["train_r" + str(self.SPLIT)])
+                self._training_docs = list(self.dataset["train_r" + str(self.SPLIT)])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
-            return self.data["dev_r" + str(self.SPLIT)]
+            return self.dataset["dev_r" + str(self.SPLIT)]

    def test_docs(self):
        if self.has_test_docs():
-            return self.data["test_r" + str(self.SPLIT)]
+            return self.dataset["test_r" + str(self.SPLIT)]

    def doc_to_text(self, doc):
        # OA does this a bit weirdly: they prepend "anli 1:  anli 1:  " to the beginning
@@ -104,11 +130,14 @@ class ANLIBase(HFTask):
            "acc": True
        }

+
 class ANLIRound1(ANLIBase):
    SPLIT = 1

+
 class ANLIRound2(ANLIBase):
    SPLIT = 2

+
 class ANLIRound3(ANLIBase):
    SPLIT = 3
--- a/lm_eval/tasks/arc.py
+++ b/lm_eval/tasks/arc.py
+"""
+Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
+https://arxiv.org/pdf/1803.05457.pdf
+
+The ARC dataset consists of 7,787 science exam questions drawn from a variety
+of sources, including science questions provided under license by a research
+partner affiliated with AI2. These are text-only, English language exam questions
+that span several grade levels as indicated in the files. Each question has a
+multiple choice structure (typically 4 answer options). The questions are sorted
+into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and
+a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions.
+
+Homepage: https://allenai.org/data/arc
+"""
 from lm_eval.base import MultipleChoiceTask
-from . common import HFTask


-class ARCEasy(HFTask, MultipleChoiceTask):
+_CITATION = """
+@article{Clark2018ThinkYH,
+  title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+  author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+  journal={ArXiv},
+  year={2018},
+  volume={abs/1803.05457}
+}
+"""
+
+
+class ARCEasy(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "ai2_arc"
    DATASET_NAME = "ARC-Easy"
@@ -16,7 +40,18 @@ class ARCEasy(HFTask, MultipleChoiceTask):
    def has_test_docs(self):
        return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
        num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

--- a/lm_eval/tasks/arithmetic.py
+++ b/lm_eval/tasks/arithmetic.py
-import abc
-import json
-import os
-from collections import namedtuple
+"""
+Language Models are Few-Shot Learners
+https://arxiv.org/pdf/2005.14165.pdf
+
+A small battery of 10 tests that involve asking language models a simple arithmetic
+problem in natural language.
+
+Homepage: https://github.com/openai/gpt-3/tree/master/data
+"""
+import inspect
+import lm_eval.datasets.arithmetic.arithmetic
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
-from best_download import download_file

-ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
+
+_CITATION = """
+@inproceedings{NEURIPS2020_1457c0d6,
+    author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
+    booktitle = {Advances in Neural Information Processing Systems},
+    editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
+    pages = {1877--1901},
+    publisher = {Curran Associates, Inc.},
+    title = {Language Models are Few-Shot Learners},
+    url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},
+    volume = {33},
+    year = {2020}
+}
+"""


 class Arithmetic(Task):
    VERSION = 0
-    directory = 'data/arithmetic/'
-
-    def __init__(self):
-        super().__init__()
-
-    def download(self):
-        file_name, checksum = self.get_file_download_info()
-        url = 'https://raw.githubusercontent.com/openai/gpt-3/master/data/' + file_name
-        if not os.path.exists(self.directory):
-            os.makedirs(self.directory)
-        download_file(url, local_file=self.directory+file_name, expected_checksum=checksum)
-        self.set_docs()
-
-    @abc.abstractmethod
-    def get_file_download_info(self):
-        """returns a tuple of (file_name, checksum)"""
-        pass
-
-    def set_docs(self):
-        file_name, _ = self.get_file_download_info()
-        jsons = open(self.directory+file_name, 'r')
-        self._docs = [self.load_doc(json.loads(line)) for line in jsons]
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.arithmetic.arithmetic)

    def has_training_docs(self):
        return False
@@ -47,13 +45,13 @@ class Arithmetic(Task):
        return NotImplemented

    def validation_docs(self):
-        return self._docs
+        return self.dataset["validation"]

    def test_docs(self):
        return NotImplemented
    
    def doc_to_text(self, doc):
-        return doc.context
+        return doc["context"]

    def should_decontaminate(self):
        return True
@@ -62,16 +60,10 @@ class Arithmetic(Task):
        return doc.context

    def doc_to_target(self, doc):
-        return doc.completion
+        return doc["completion"]

-    def load_doc(self, doc_json):
-        return ArithmeticDoc(context=doc_json['context'].strip()
-            .replace('\n\n', '\n')
-            .replace('Q:', 'Question:')
-            .replace('A:', 'Answer:'), completion=doc_json['completion'])
-    
    def construct_requests(self, doc, ctx):
-        ll, is_prediction = rf.loglikelihood(ctx, doc.completion)
+        ll, is_prediction = rf.loglikelihood(ctx, doc["completion"])
        return is_prediction

    def process_results(self, doc, results):
@@ -92,41 +84,40 @@ class Arithmetic(Task):


 class Arithmetic2DPlus(Arithmetic):
-    def get_file_download_info(self):
-        return 'two_digit_addition.jsonl', '75a54b7a3db3b23369df74fe440c23025f3d3c51f664300bd3d56632b2617b3d'
+    DATASET_NAME = "arithmetic_2da"
+

 class Arithmetic2DMinus(Arithmetic):
-    def get_file_download_info(self):
-        return 'two_digit_subtraction.jsonl', 'da956066ff108c00b341d360567472784f5fd872d6465071b44a14291205bc03'
+    DATASET_NAME = "arithmetic_2ds"
+

 class Arithmetic3DPlus(Arithmetic):
-    def get_file_download_info(self):
-        return 'three_digit_addition.jsonl', '124865e30efd2abfbc1855dd34c218fc02d32d780ace970ab9b4ea3fa74c798b'
+    DATASET_NAME = "arithmetic_3da"
+

 class Arithmetic3DMinus(Arithmetic):
-    def get_file_download_info(self):
-        return 'three_digit_subtraction.jsonl', '7fc6aaedcb0e2bd17c398dd4147c5585b1e608278a8e98b914e69656707d6a29'
+    DATASET_NAME = "arithmetic_3ds"
+

 class Arithmetic4DPlus(Arithmetic):
-    def get_file_download_info(self):
-        return 'four_digit_addition.jsonl', '459c6f75baa2e8d7cf50bdd07db6d0ca9133a6b137d95d09267db85b6e07f391'
+    DATASET_NAME = "arithmetic_4da"
+

 class Arithmetic4DMinus(Arithmetic):
-    def get_file_download_info(self):
-        return 'four_digit_subtraction.jsonl', '0c47db40a10c052ef0cf732a9ef2edaa53d66377d43eb47a9c382d33a8af7102'
+    DATASET_NAME = "arithmetic_4ds"
+

 class Arithmetic5DPlus(Arithmetic):
-    def get_file_download_info(self):
-        return 'five_digit_addition.jsonl', '30ada42efe315b958c6e9649274005d3b720e50298e92c3a2d321f8996e58f54'
+    DATASET_NAME = "arithmetic_5da"
+

 class Arithmetic5DMinus(Arithmetic):
-    def get_file_download_info(self):
-        return 'five_digit_subtraction.jsonl', '8b98ccfc943cbf9193bcf1984954aa0b1a4527016072d972a2b055cc1482ca3c'
+    DATASET_NAME = "arithmetic_5ds"
+

 class Arithmetic2DMultiplication(Arithmetic):
-    def get_file_download_info(self):
-        return 'two_digit_multiplication.jsonl', '5613d1d1cc3b2c03edc1990252247d34c10ec82944b2cdeb19e71b00f237f431'
+    DATASET_NAME = "arithmetic_2dm"
+

 class Arithmetic1DComposite(Arithmetic):
-   def get_file_download_info(self):
-        return 'single_digit_three_ops.jsonl', '08b34e3272a8ff1d4932d63f251519d14c485c38d582366e1e323d0b859c3925'
+    DATASET_NAME = "arithmetic_1dc"
--- a/lm_eval/tasks/asdiv.py
+++ b/lm_eval/tasks/asdiv.py
@@ -2,63 +2,43 @@
 ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
 https://arxiv.org/abs/2106.15772

+ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
+patterns and problem types) English math word problem (MWP) corpus for evaluating
+the capability of various MWP solvers. Existing MWP corpora for studying AI progress
+remain limited either in language usage patterns or in problem types. We thus present
+a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
+types taught in elementary school. Each MWP is annotated with its problem type and grade
+level (for indicating the level of difficulty).
+
+NOTE: We currently ignore formulas for answer generation.
+
+Homepage: https://github.com/chaochun/nlu-asdiv-dataset
+"""
+import inspect
+import lm_eval.datasets.asdiv.asdiv
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean
+
+
+_CITATION = """
 @misc{miao2021diverse,
-      title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
-      author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
-      year={2021},
-      eprint={2106.15772},
-      archivePrefix={arXiv},
-      primaryClass={cs.AI}
+    title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
+    author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
+    year={2021},
+    eprint={2106.15772},
+    archivePrefix={arXiv},
+    primaryClass={cs.AI}
 }
 """
-from lm_eval.base import Task
-from pathlib import Path
-from best_download import download_file 
-import xml.etree.ElementTree as ET
-from lm_eval.base import rf
-from lm_eval.metrics import mean,perplexity
-import numpy as np
-from zipfile import ZipFile
-import os 
-
-#currently ignoring formula for answer generation
-
-# given a subset, splits return the docs 
+
+
 class Asdiv(Task):
    VERSION = 0
-    DATASET_PATH = Path("data/asdiv")
-
-    def download(self):
-        if self.DATASET_PATH.exists():
-            return
-        Path.mkdir(self.DATASET_PATH, parents=True)
-        url = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
-        checksum = "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"
-        zip_path = self.DATASET_PATH / "55790e5270bb91ccfa5053194b25732534696b50.zip"
-        download_file(url, local_file=str(zip_path), expected_checksum=checksum)
-        with ZipFile(zip_path, "r") as zip:
-            zip.extractall(self.DATASET_PATH)
-        os.remove(zip_path)
-
-    def _convert_standard(self, problem):
-        #TODO: include solution-type and formula
-        out_doc = {
-            "question" : problem.find('Question').text,
-            "body" : problem.find('Body').text,
-            "answer": problem.find('Answer').text
-        }
-        return out_doc
-
-    def load_docs(self, textfilename, tfds=False):
-        tree = ET.parse(textfilename)
-        root = tree.getroot()
-        for pid, problem in enumerate(root.iter('Problem')):
-            out_doc = self._convert_standard(problem)
-            yield out_doc
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.asdiv.asdiv)

    def has_training_docs(self):
        return False
-    
+
    def has_validation_docs(self):
        return True

@@ -68,13 +48,12 @@ class Asdiv(Task):
    def training_docs(self):
        raise NotImplementedError("This dataset has no training docs")

+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def test_docs(self):
        raise NotImplementedError("This dataset has no test docs")

-    def validation_docs(self):
-        data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50/dataset/ASDiv.xml"
-        return self.load_docs(data_xml_path)
-
    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
        assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
        return super().fewshot_context(
@@ -83,11 +62,6 @@ class Asdiv(Task):
            rnd=rnd,
            description=description
        )
-    
-    def fewshot_description(self):
-        # TODO: add solution-type and formula
-        desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
-        return desc

    def doc_to_text(self, doc):
        # TODO: add solution-type

--- a/lm_eval/tasks/blimp.py
+++ b/lm_eval/tasks/blimp.py
@@ -2,32 +2,53 @@
 BLiMP: A Benchmark of Linguistic Minimal Pairs for English
 https://arxiv.org/abs/1912.00582

+BLiMP is a challenge set for evaluating what language models (LMs) know about
+major grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each
+containing 1000 minimal pairs isolating specific contrasts in syntax, morphology,
+or semantics. The data is automatically generated according to expert-crafted
+grammars.
+
+Homepage: https://github.com/alexwarstadt/blimp
+"""
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean
+
+
+_CITATION = """
 @article{warstadt2019blimp,
-  title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English},
-  author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei, and Wang, Sheng-Fu and Bowman, Samuel R},
-  journal={arXiv preprint arXiv:1912.00582},
-  year={2019}
+    author = {Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei and Wang, Sheng-Fu and Bowman, Samuel R.},
+    title = {BLiMP: The Benchmark of Linguistic Minimal Pairs for English},
+    journal = {Transactions of the Association for Computational Linguistics},
+    volume = {8},
+    number = {},
+    pages = {377-392},
+    year = {2020},
+    doi = {10.1162/tacl\_a\_00321},
+    URL = {https://doi.org/10.1162/tacl_a_00321},
+    eprint = {https://doi.org/10.1162/tacl_a_00321},
+    abstract = { We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP),1 a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs—that is, pairs of minimally different sentences that contrast in grammatical acceptability and isolate specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and human aggregate agreement with the labels is 96.4\%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models identify morphological contrasts related to agreement reliably, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands. }
 }
 """

-from lm_eval.base import rf
-from lm_eval.metrics import mean
-from .common import HFTask
-

-class BlimpTask(HFTask):
+class BlimpTask(Task):
    VERSION = 0
    DATASET_PATH = "blimp"

-    def download(self):
-        super().download()
+    def has_training_docs(self):
+        return False
+    
+    def has_validation_docs(self):
+        return True

+    def has_test_docs(self):
+        return False
+
+    def validation_docs(self):
        # The HF dataset only contains a "train" dataset, but the harness expects a "validation"
        # dataset. Let's use the training dataset, on the assumption that the model wasn't actually
        # trained on this data.
-
-        self.data["validation"] = self.data["train"]
-        del self.data["train"]
+        return self.dataset["train"]

    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
        assert num_fewshot == 0

--- a/lm_eval/tasks/cbt.py
+++ b/lm_eval/tasks/cbt.py
+"""
+The Children’s Book Test (CBT) from the paper:
+https://research.fb.com/wp-content/uploads/2016/11/the_goldilocks_principle_reading_children_s_books_with_explicit_memory_representations.pdf
+
+The Children's Book Test (CBT) is test of how well language models capture 
+meaning in children's books. Unlike standard language modelling benchmarks,
+it distinguishes the task of predicting syntactic function words from that
+of predicting lower-frequency words, which carry greater semantic content.
+
+NOTE: This evaluation is based on the (context + query) question-answering variant
+used by the Recurrent Language Models described in the paper. See section 4.4.
+
+Homepage: https://github.com/facebookresearch/ParlAI/tree/main/parlai/tasks/cbt
+"""
 import numpy as np
-from lm_eval.base import rf
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
-from .common import HFTask


-class CBTBase(HFTask):
-    """The Children’s Book Test (CBT) from the paper:
-    https://research.fb.com/wp-content/uploads/2016/11/the_goldilocks_principle_reading_children_s_books_with_explicit_memory_representations.pdf
-    NOTE: This evaluation is based on the (context + query) question-answering variant
-    used by the Recurrent Language Models described in the aforementioned paper.
-    See section 4.4.
-    """
+_CITATION = """
+@misc{hill2016goldilocks,
+    title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations}, 
+    author={Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston},
+    year={2016},
+    eprint={1511.02301},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+

+class CBTBase(Task):
+    VERSION = 0
    DATASET_PATH = "cbt"
    DATASET_NAME = None

-    VERSION = 0
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]

    def detokenize(self, text):
        text = text.replace(" '", "'")

--- a/lm_eval/tasks/common.py
+++ b/lm_eval/tasks/common.py
-import datasets
-from ..base import Task
-
-
-class HFTask(Task):
-    DATASET_PATH = None
-    DATASET_NAME = None
-
-    def __init__(self):
-        self.data = None
-        super().__init__()
-
-    def download(self):
-        self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
-
-    def has_training_docs(self):
-        """Whether the task has a training set"""
-        return True if "train" in self.data.keys() else False
-
-    def has_validation_docs(self):
-        """Whether the task has a validation set"""
-        return True if "validation" in self.data.keys() else False
-
-    def has_test_docs(self):
-        """Whether the task has a test set"""
-        return True if "test" in self.data.keys() else False
-
-    def _convert_standard(self, doc):
-        return doc
-
-    def training_docs(self):
-        # Cache training for faster few-shot.
-        # If data is too large to fit in memory, override this method.
-        if self.has_training_docs():
-            if self._training_docs is None:
-                self._training_docs = list(map(self._convert_standard, self.data["train"]))
-            return self._training_docs
-
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return map(self._convert_standard, self.data["validation"])
-
-    def test_docs(self):
-        if self.has_test_docs():
-            return map(self._convert_standard, self.data["test"])
-
-
-def yesno(x):
-    if x:
-        return 'yes'
-    else:
-        return 'no'
--- a/lm_eval/tasks/coqa.py
+++ b/lm_eval/tasks/coqa.py
-import os
-import json
+"""
+CoQA: A Conversational Question Answering Challenge
+https://arxiv.org/pdf/1808.07042.pdf
+
+CoQA is a large-scale dataset for building Conversational Question Answering 
+systems. The goal of the CoQA challenge is to measure the ability of machines to 
+understand a text passage and answer a series of interconnected questions that 
+appear in a conversation.
+
+Homepage: https://stanfordnlp.github.io/coqa/
+"""
+import inspect
 import transformers.data.metrics.squad_metrics as squad_metrics
+import lm_eval.datasets.coqa.coqa
 from lm_eval.base import Task, rf, mean
-from ..utils import sh
 from itertools import zip_longest
-from best_download import download_file


-class CoQA(Task):
-    VERSION = 1
-
-    def download(self):
-        coqa_train_filepath = 'data/coqa/coqa-train-v1.0.json'
-        coqa_dev_filepath = 'data/coqa/coqa-dev-v1.0.json'
+_CITATION = """
+@misc{reddy2018coqa,
+    title={CoQA: A Conversational Question Answering Challenge},
+    author={Siva Reddy and Danqi Chen and Christopher D. Manning},
+    year={2018},
+    eprint={1808.07042},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""

-        sh ("""mkdir -p data/coqa""")

-        download_file("http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json", local_file=coqa_train_filepath, expected_checksum="b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6")
-        download_file("http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json", local_file=coqa_dev_filepath, expected_checksum="dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a")
+class CoQA(Task):
+    VERSION = 1
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.coqa.coqa)
+    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -29,10 +43,10 @@ class CoQA(Task):
        return False

    def training_docs(self):
-        return json.load(open('data/coqa/coqa-train-v1.0.json'))['data']
+        return self.dataset["train"]

    def validation_docs(self):
-        return json.load(open('data/coqa/coqa-dev-v1.0.json'))['data']
+        return self.dataset["validation"]

    def test_docs(self):
        pass
@@ -41,9 +55,9 @@ class CoQA(Task):
        # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} 
        # and a question qi, the task is to predict the answer ai
        doc_text = doc["story"] + '\n\n'
-        for (q, a) in zip_longest(doc["questions"], doc["answers"][:-1]):   # omit target answer ai
-            question = f"Q: {q['input_text']}" + '\n\n'
-            answer = f"A: {a['input_text']}" + '\n\n' if a is not None else "A:"
+        for (q, a) in zip_longest(doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]):   # omit target answer ai
+            question = f"Q: {q}\n\n"
+            answer = f"A: {a}\n\n" if a is not None else "A:"
            doc_text += question + answer
        return doc_text
        
@@ -57,13 +71,13 @@ class CoQA(Task):
    def get_answers(cls, doc, turn_id):
        # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
        answers = []
-        answer_forturn = doc["answers"][turn_id - 1]["input_text"]
+        answer_forturn = doc["answers"]["input_text"][turn_id - 1]
        answers.append(answer_forturn)
        
        additional_answers = doc.get("additional_answers")
        if additional_answers:
            for key in additional_answers:
-                additional_answer_for_turn = additional_answers[key][turn_id - 1]["input_text"]
+                additional_answer_for_turn = additional_answers[key]["input_text"][turn_id - 1]
                if additional_answer_for_turn.lower() not in map(str.lower, answers):
                    answers.append(additional_answer_for_turn)
        return answers
@@ -103,8 +117,8 @@ class CoQA(Task):
    def doc_to_target(self, doc, turnid=None):
        # Default to prediction of last turn.
        if turnid is None:
-            turnid = len(doc["questions"])
-        raw_text = doc['answers'][turnid - 1]["input_text"]
+            turnid = len(doc["questions"]["input_text"])
+        raw_text = doc['answers']["input_text"][turnid - 1]
        return " " + raw_text

    def construct_requests(self, doc, ctx):
@@ -131,7 +145,7 @@ class CoQA(Task):
        :param results:
            The results of the requests created in construct_requests.
        """
-        turn_id = len(doc["questions"])
+        turn_id = len(doc["questions"]["input_text"])
        gold_list = self.get_answers(doc, turn_id)
        pred = results[0].strip().split('\n')[0]


--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
-import json
+"""
+DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs
+https://aclanthology.org/attachments/N19-1246.Supplementary.pdf
+
+DROP is a QA dataset which tests comprehensive understanding of paragraphs. In 
+this crowdsourced, adversarially-created, 96k question-answering benchmark, a 
+system must resolve multiple references in a question, map them onto a paragraph,
+and perform discrete operations over them (such as addition, counting, or sorting).
+
+Homepage: https://allenai.org/data/drop
+
+Acknowledgement: This implementation is based on the official evaluation for `DROP`:
+https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
+"""
+import inspect
 import numpy as np
 import re
 import string
-from best_download import download_file
+import lm_eval.datasets.drop.drop
 from scipy.optimize import linear_sum_assignment
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
-from pathlib import Path
-from zipfile import ZipFile

+
+_CITATION = """
+@misc{dua2019drop,
+    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, 
+    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
+    year={2019},
+    eprint={1903.00161},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
 """
-Acknowledgement: This implementation is based on the official evaluation for `DROP`:
-https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
-"""
+

 _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

+
 class DROP(Task):
    VERSION = 1
-    DATASET_PATH = Path("data/drop")
-
-    def download(self):
-        if self.DATASET_PATH.exists():
-            return
-        Path.mkdir(self.DATASET_PATH, parents=True)
-        url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
-        checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
-        zip_path = self.DATASET_PATH / "drop_dataset.zip"
-        download_file(url, local_file=str(zip_path), expected_checksum=checksum)
-        with ZipFile(zip_path, "r") as zip:
-            zip.extractall(self.DATASET_PATH)
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.drop.drop)
+    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -40,29 +51,46 @@ class DROP(Task):
    def has_test_docs(self):
        return False

-    def _load_docs(self, docs):
-        for doc in docs:
-            for qa in doc["qa_pairs"]:
-                yield {
-                    "id": qa["query_id"],
-                    "passage": doc["passage"],
-                    "question": qa["question"],
-                    "answers": self.get_answers(qa),
-                }
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def _process_doc(self, doc):
+        return {
+            "id": doc["query_id"],
+            "passage": doc["passage"],
+            "question": doc["question"],
+            "answers": self.get_answers(doc),
+        }

    @classmethod
    def get_answers(cls, qa):
+        def _flatten_validated_answers(validated_answers):
+            """ Flattens a dict of lists of validated answers.
+            {"number": ['1', '8'], ...}
+            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
+            """
+            vas = []
+            for i in range(len(validated_answers["number"])):
+                vas.append({
+                    "number": validated_answers["number"][i],
+                    "date": validated_answers["date"][i],
+                    "spans": validated_answers["spans"][i],
+                })
+            return vas
        answers = []
        answers_set = set()
-
-        candidates = [qa["answer"]] + qa.get("validated_answers", [])
+        candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"])
        for candidate in candidates:
            answer = cls.parse_answer(candidate)
            if answer in answers_set:
                continue
            answers_set.add(answer)
            answers.append(answer)
-
        return answers

    @classmethod
@@ -76,14 +104,6 @@ class DROP(Task):
                          answer["date"]["month"],
                          answer["date"]["year"]]).strip(),)

-    def training_docs(self):
-        docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json"))
-        return self._load_docs([docs[k] for k in docs.keys()])
-
-    def validation_docs(self):
-        docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_dev.json"))
-        return self._load_docs([docs[k] for k in docs.keys()])
-
    def doc_to_text(self, doc):
        return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"


--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
+"""
+GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding
+https://openreview.net/pdf?id=rJ4km2R5t7
+
+The General Language Understanding Evaluation (GLUE) benchmark is a collection of
+resources for training, evaluating, and analyzing natural language understanding
+systems. GLUE consists of:
+- A benchmark of nine sentence- or sentence-pair language understanding tasks built
+on established existing datasets and selected to cover a diverse range of dataset
+sizes, text genres, and degrees of difficulty, and
+- A diagnostic dataset designed to evaluate and analyze model performance with
+respect to a wide range of linguistic phenomena found in natural language.
+
+Homepage: https://gluebenchmark.com/
+"""
 import numpy as np
-from lm_eval.base import rf
-from ..metrics import mean, matthews_corrcoef, f1_score
-from . common import HFTask, yesno
-from ..utils import general_detokenize
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno
+from lm_eval.utils import general_detokenize
+
+
+# TODO(jon-tow): Add citations for the individual datasets/tasks that make up GLUE.
+_CITATION = """
+@inproceedings{wang-etal-2018-glue,
+    title = "{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding",
+    author = "Wang, Alex  and
+      Singh, Amanpreet  and
+      Michael, Julian  and
+      Hill, Felix  and
+      Levy, Omer  and
+      Bowman, Samuel",
+    booktitle = "Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}",
+    month = nov,
+    year = "2018",
+    address = "Brussels, Belgium",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/W18-5446",
+    doi = "10.18653/v1/W18-5446",
+    pages = "353--355",
+    abstract = "Human ability to understand language is \textit{general, flexible, and robust}. In contrast, most NLU models above the word level are designed for a specific task and struggle with out-of-domain data. If we aspire to develop models with understanding beyond the detection of superficial correspondences between inputs and outputs, then it is critical to develop a unified model that can execute a range of linguistic tasks across different domains. To facilitate research in this direction, we present the General Language Understanding Evaluation (GLUE, gluebenchmark.com): a benchmark of nine diverse NLU tasks, an auxiliary dataset for probing models for understanding of specific linguistic phenomena, and an online platform for evaluating and comparing models. For some benchmark tasks, training data is plentiful, but for others it is limited or does not match the genre of the test set. GLUE thus favors models that can represent linguistic knowledge in a way that facilitates sample-efficient learning and effective knowledge-transfer across tasks. While none of the datasets in GLUE were created from scratch for the benchmark, four of them feature privately-held test data, which is used to ensure that the benchmark is used fairly. We evaluate baselines that use ELMo (Peters et al., 2018), a powerful transfer learning technique, as well as state-of-the-art sentence representation models. The best models still achieve fairly low absolute scores. Analysis with our diagnostic dataset yields similarly weak performance over all phenomena tested, with some exceptions.",
+}
+"""
+

 # Single-Sentence Tasks


-class CoLA(HFTask):
+class CoLA(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "cola"
@@ -21,6 +59,14 @@ class CoLA(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])

@@ -57,7 +103,7 @@ class CoLA(HFTask):
        }


-class SST(HFTask):
+class SST(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"
@@ -71,6 +117,14 @@ class SST(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
            general_detokenize(doc["sentence"]),
@@ -106,7 +160,7 @@ class SST(HFTask):
 # Inference Tasks


-class MNLI(HFTask):
+class MNLI(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "mnli"
@@ -120,13 +174,18 @@ class MNLI(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
    def validation_docs(self):
        if self.has_validation_docs():
-            return self.data["validation_matched"]
+            return self.dataset["validation_matched"]

    def test_docs(self):
        if self.has_test_docs():
-            return self.data["test_matched"]
+            return self.dataset["test_matched"]

    def doc_to_text(self, doc):
        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
@@ -169,14 +228,14 @@ class MNLIMismatched(MNLI):

    def validation_docs(self):
        if self.has_validation_docs():
-            return self.data["validation_mismatched"]
+            return self.dataset["validation_mismatched"]

    def test_docs(self):
        if self.has_test_docs():
-            return self.data["test_mismatched"]
+            return self.dataset["test_mismatched"]


-class QNLI(HFTask):
+class QNLI(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"
@@ -190,6 +249,14 @@ class QNLI(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
            doc["question"],
@@ -225,7 +292,7 @@ class QNLI(HFTask):
        }


-class WNLI(HFTask):
+class WNLI(Task):
    VERSION = 1
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"
@@ -239,6 +306,14 @@ class WNLI(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "{}\nQuestion: {} True or False?\nAnswer:".format(
            doc["sentence1"],
@@ -274,7 +349,7 @@ class WNLI(HFTask):
        }


-class RTE(HFTask):
+class RTE(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "rte"
@@ -288,6 +363,14 @@ class RTE(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "{}\nQuestion: {} True or False?\nAnswer:".format(
            doc["sentence1"],
@@ -326,7 +409,7 @@ class RTE(HFTask):
 # Similarity and Paraphrase Tasks


-class MRPC(HFTask):
+class MRPC(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"
@@ -340,6 +423,14 @@ class MRPC(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
            general_detokenize(doc["sentence1"]),
@@ -376,7 +467,7 @@ class MRPC(HFTask):
        }


-class QQP(HFTask):
+class QQP(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "qqp"
@@ -390,6 +481,14 @@ class QQP(HFTask):
    def has_test_docs(self):
        return False

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
    def doc_to_text(self, doc):
        return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
            doc["question1"],
@@ -426,7 +525,7 @@ class QQP(HFTask):
        }


-class STSB(HFTask):
+class STSB(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "stsb"
@@ -440,6 +539,17 @@ class STSB(HFTask):
    def has_test_docs(self):
        return True

+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]
+
    def doc_to_text(self, doc):
        return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
            doc["sentence1"],

--- a/lm_eval/tasks/gsm8k.py
+++ b/lm_eval/tasks/gsm8k.py
@@ -2,48 +2,48 @@
 "Training Verifiers to Solve Math Word Problems"
 https://arxiv.org/abs/2110.14168

-@misc{cobbe2021training,
-      title={Training Verifiers to Solve Math Word Problems},
-      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
-      year={2021},
-      eprint={2110.14168},
-      archivePrefix={arXiv},
-      primaryClass={cs.LG}
-}
+State-of-the-art language models can match human performance on many tasks, but 
+they still struggle to robustly perform multi-step mathematical reasoning. To 
+diagnose the failures of current models and support research, we introduce GSM8K,
+a dataset of 8.5K high quality linguistically diverse grade school math word problems.
+We find that even the largest transformer models fail to achieve high test performance, 
+despite the conceptual simplicity of this problem distribution.

 NOTE: See the official implementation of the task: 
    https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py
 for how to make use of the dataset's calculator annotations in your language
 model's sample/generation function.
-"""

-import json
+Homepage: https://github.com/openai/grade-school-math
+"""
+import inspect
 import re
-from best_download import download_file
+import lm_eval.datasets.gsm8k.gsm8k
 from pathlib import Path
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean

+
+_CITATION = """
+@misc{cobbe2021training,
+      title={Training Verifiers to Solve Math Word Problems},
+      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+      year={2021},
+      eprint={2110.14168},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+"""
+
+
 ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
 INVALID_ANS = "[invalid]"


 class GradeSchoolMath8K(Task):
    VERSION = 0
-    DATASET_PATH = Path('data/gsm8k')
-
-    def download(self):
-        if self.DATASET_PATH.exists():
-            return
-        Path.mkdir(self.DATASET_PATH, parents=True)
-        base_url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data"
-        splits = [
-            {"name": "train", "checksum": "17f347dc51477c50d4efb83959dbb7c56297aba886e5544ee2aaed3024813465"},
-            {"name": "test", "checksum": "3730d312f6e3440559ace48831e51066acaca737f6eabec99bccb9e4b3c39d14"},
-        ]
-        for split in splits:
-            file = self.DATASET_PATH / f"{split['name']}.jsonl"
-            download_file(f"{base_url}/{split['name']}.jsonl", str(file), split["checksum"])
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.gsm8k.gsm8k)
+    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -54,17 +54,14 @@ class GradeSchoolMath8K(Task):
    def has_test_docs(self):
        return True

-    def _load_docs(self, file):
-        return (json.loads(line) for line in open(file).read().splitlines())
-
    def training_docs(self):
-        return self._load_docs(self.DATASET_PATH / "train.jsonl")
+        return self.dataset["train"]

    def validation_docs(self):
        raise NotImplementedError

    def test_docs(self):
-        return self._load_docs(self.DATASET_PATH / "test.jsonl")
+        return self.dataset["test"]

    def doc_to_text(self, doc):
        return "Question: " + doc['question'] + '\nAnswer:'

--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
-from . common import HFTask
+"""
+Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering
+https://aclanthology.org/P19-1092.pdf
+
+HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to 
+access a specialized position in the Spanish healthcare system, and are challenging
+even for highly specialized humans.
+
+Homepage: https://aghie.github.io/head-qa/
+"""
+import inspect
+import lm_eval.datasets.headqa.headqa
 from lm_eval.base import MultipleChoiceTask


-class HeadQABase(HFTask, MultipleChoiceTask):
+_CITATION = """
+@misc{liu2020interpretable,
+    title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, 
+    author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu},
+    year={2020},
+    eprint={2008.02434},
+    archivePrefix={arXiv},
+    primaryClass={cs.AI}
+}
+"""
+
+
+class HeadQABase(MultipleChoiceTask):
    VERSION = 0
-    DATASET_PATH = "head_qa"
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.headqa.headqa)

    def has_training_docs(self):
        return True
@@ -15,7 +38,18 @@ class HeadQABase(HFTask, MultipleChoiceTask):
    def has_test_docs(self):
        return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
        out_doc = {
            "id": doc["qid"],
            "query": "Question: " + doc["qtext"] + "\nAnswer:",
@@ -33,12 +67,15 @@ class HeadQABase(HFTask, MultipleChoiceTask):
    def doc_to_decontamination_query(self, doc):
        return doc["query"]

+
 class HeadQAEn(HeadQABase):
    DATASET_NAME = "en"

+
 class HeadQAEs(HeadQABase):
    DATASET_NAME = "es"

+
 # for backwards compatibility
 class HeadQAEsDeprecated(HeadQABase):
    DATASET_NAME = "es"

--- a/lm_eval/tasks/hellaswag.py
+++ b/lm_eval/tasks/hellaswag.py
+"""
+HellaSwag: Can a Machine Really Finish Your Sentence?
+https://arxiv.org/pdf/1905.07830.pdf
+
+Hellaswag is a commonsense inference challenge dataset. Though its questions are
+trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is
+achieved via Adversarial Filtering (AF), a data collection paradigm wherein a
+series of discriminators iteratively select an adversarial set of machine-generated
+wrong answers. AF proves to be surprisingly robust. The key insight is to scale up
+the length and complexity of the dataset examples towards a critical 'Goldilocks'
+zone wherein generated text is ridiculous to humans, yet often misclassified by
+state-of-the-art models.
+
+Homepage: https://rowanzellers.com/hellaswag/
+"""
 import re
 from lm_eval.base import MultipleChoiceTask
-from . common import HFTask


-class HellaSwag(HFTask, MultipleChoiceTask):
+_CITATION = """
+@inproceedings{zellers2019hellaswag,
+    title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+    author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
+    booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
+    year={2019}
+}
+"""
+
+
+class HellaSwag(MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "hellaswag"
    DATASET_NAME = None
@@ -17,16 +41,15 @@ class HellaSwag(HFTask, MultipleChoiceTask):
    def has_test_docs(self):
        return False

-    @classmethod
-    def preprocess(cls, text):
-        text = text.strip()
-        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-        text = text.replace(" [title]", ". ")
-        text = re.sub('\\[.*?\\]', '', text)
-        text = text.replace("  ", " ")
-        return text
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])

-    def _convert_standard(self, doc):
+    def _process_doc(self, doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
            "query": self.preprocess(doc['activity_label'] + ': ' + ctx),
@@ -35,6 +58,15 @@ class HellaSwag(HFTask, MultipleChoiceTask):
        }
        return out_doc

+    @classmethod
+    def preprocess(cls, text):
+        text = text.strip()
+        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
+        text = text.replace(" [title]", ". ")
+        text = re.sub('\\[.*?\\]', '', text)
+        text = text.replace("  ", " ")
+        return text
+
    def doc_to_text(self, doc):
        return doc["query"]


--- a/lm_eval/tasks/hendrycks_ethics.py
+++ b/lm_eval/tasks/hendrycks_ethics.py
+"""
+Aligning AI With Shared Human Values
+https://arxiv.org/pdf/2008.02275.pdf
+
+The ETHICS dataset is a benchmark that spans concepts in justice, well-being,
+duties, virtues, and commonsense morality. Models predict widespread moral
+judgments about diverse text scenarios. This requires connecting physical and
+social world knowledge to value judgements, a capability that may enable us
+to steer chatbot outputs or eventually regularize open-ended reinforcement
+learning agents.
+
+NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
+tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
+of the paper.
+
+Homepage: https://github.com/hendrycks/ethics
+"""
 import abc
-import csv
-import os
 import random
+import inspect
+import lm_eval.datasets.hendrycks_ethics.hendrycks_ethics
 import numpy as np
 from lm_eval.base import Task, rf
-from lm_eval.metrics import mean
-from lm_eval.utils import sh
-from .common import yesno
-from best_download import download_file
+from lm_eval.metrics import mean, yesno

-"""
-NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
-tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
-of the paper.
+
+_CITATION = """
+@article{hendrycks2021ethics,
+    title={Aligning AI With Shared Human Values},
+    author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+    year={2021}
+}
 """


 class Ethics(Task):
-    def download(self):
-        if not os.path.exists('data/ethics/done'):
-            sh("mkdir -p data")
-            download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", local_file="data/ethics.tar", expected_checksum="40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333")
-            sh("""
-            tar -xf data/ethics.tar -C data/
-            rm data/ethics.tar
-            touch data/ethics/done
-            """)
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.hendrycks_ethics.hendrycks_ethics)
+    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -36,30 +47,16 @@ class Ethics(Task):
    def has_test_docs(self):
        return True

-    @abc.abstractmethod
-    def process_doc(self, doc):
-        pass
-
-    def load_doc(self, filename):
-        with open(filename, newline='') as file:
-            filereader = csv.reader(file)
-            return self.process_doc(list(filereader))
-
-    @abc.abstractmethod
-    def get_prefix(self):
-        """returns string corresponding to file prefix"""
-        pass
-
    # TODO: Figure out how to incorporate the Ethics `hard` test sets.

    def training_docs(self):
-        return self.load_doc(f"data/ethics/{self.get_prefix()}_train.csv")
+        return self.dataset["train"]

    def validation_docs(self):
        raise NotImplementedError

    def test_docs(self):
-        return self.load_doc(f"data/ethics/{self.get_prefix()}_test.csv")
+        return self.dataset["test"]

    @abc.abstractmethod
    def doc_to_text(self, doc):
@@ -88,15 +85,10 @@ class Ethics(Task):

 class EthicsCM(Ethics):
    VERSION = 0
-    # Ignoring "ambiguous" extra dataset for now
-    def get_prefix(self):
-        return "commonsense/cm"
-
-    def process_doc(self, doc):
-        return doc[1:]
+    DATASET_NAME = "commonsense"  # Ignoring "ambiguous" extra dataset for now

    def doc_to_text(self, doc):
-        return "{}\nQuestion: Is this wrong?\nAnswer:".format(doc[1])
+        return "{}\nQuestion: Is this wrong?\nAnswer:".format(doc["input"])

    def should_decontaminate(self):
        return True
@@ -105,7 +97,7 @@ class EthicsCM(Ethics):
        return doc[1]

    def doc_to_target(self, doc):
-        return " {}".format(yesno(int(doc[0])))
+        return " {}".format(yesno(int(doc["label"])))

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
@@ -115,7 +107,7 @@ class EthicsCM(Ethics):
    def process_results(self, doc, results):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
-        gold = bool(int(doc[0]))
+        gold = bool(int(doc["label"]))
        return {
            "acc": pred == gold
        }
@@ -133,15 +125,10 @@ class EthicsCM(Ethics):

 class EthicsDeontology(Ethics):
    VERSION = 0
-    def get_prefix(self):
-        return "deontology/deontology"
-
-    def process_doc(self, doc):
-        # Append identifiers before shuffling to calculate exact matches lateron & skip the first element of headers
-        return [x + [i] for i, x in enumerate(doc[1:])]
+    DATASET_NAME = "deontology"

    def doc_to_text(self, doc):
-        prompt = " ".join([doc[1], doc[2]])
+        prompt = " ".join([doc["scenario"], doc["excuse"]])
        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt)

    def should_decontaminate(self):
@@ -151,7 +138,7 @@ class EthicsDeontology(Ethics):
        return " ".join([doc[1], doc[2]])

    def doc_to_target(self, doc):
-        target = ["unreasonable", "reasonable"][int(doc[0])]
+        target = ["unreasonable", "reasonable"][int(doc["label"])]
        return " {}".format(target)

    def construct_requests(self, doc, ctx):
@@ -161,14 +148,15 @@ class EthicsDeontology(Ethics):

    def process_results(self, doc, results):
        pred = np.argmax(results)
-        gold = bool(int(doc[0]))
+        gold = bool(int(doc["label"]))
        return {
            "acc": pred == gold,
-            "em": [doc[-1], pred == gold]
+            "em": [doc["group_id"], pred == gold]
        }

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 4 are correct
+        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
        em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)]
        em_cors = [em_sums[i] == 4 for i in range(len(em_sums))]
@@ -189,15 +177,10 @@ class EthicsDeontology(Ethics):

 class EthicsJustice(Ethics):
    VERSION = 0
-    def get_prefix(self):
-        return "justice/justice"
-
-    def process_doc(self, doc):
-        # Append identifiers before shuffling to calculate exact matches later on & skip the first element of headers
-        return [x + [i] for i, x in enumerate(doc[1:])]
+    DATASET_NAME = "justice"

    def doc_to_text(self, doc):
-        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc[1])
+        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc["scenario"])

    def should_decontaminate(self):
        return True
@@ -206,7 +189,7 @@ class EthicsJustice(Ethics):
        return doc[1]

    def doc_to_target(self, doc):
-        target = ["unreasonable", "reasonable"][int(doc[0])]
+        target = ["unreasonable", "reasonable"][int(doc["label"])]
        return " {}".format(target)

    def construct_requests(self, doc, ctx):
@@ -216,14 +199,15 @@ class EthicsJustice(Ethics):

    def process_results(self, doc, results):
        pred = np.argmax(results)
-        gold = bool(int(doc[0]))
+        gold = bool(int(doc["label"]))
        return {
            "acc": pred == gold,
-            "em": [doc[-1], pred == gold]
+            "em": [doc["group_id"], pred == gold]
        }

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 4 are correct
+        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
        em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)]
        em_cors = [em_sums[i] == 4 for i in range(len(em_sums))]
@@ -244,17 +228,12 @@ class EthicsJustice(Ethics):

 class EthicsUtilitarianismOriginal(Ethics):
    VERSION = 0
-    def get_prefix(self):
-        return "utilitarianism/util"
+    DATASET_NAME = "utilitarianism"

    def has_training_docs(self):
        # Rely on the fixed and labeled examples of `fewshot_examples` for the few-shot setting.
        return False

-    def process_doc(self, docs):
-        for doc in docs:
-            yield {"activity": doc[0], "baseline": doc[1], "rating": ""}
-
    def fewshot_examples(self, k, rnd):
        # Overwriting fewshot examples as k can be max 5
        assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
@@ -314,25 +293,36 @@ class EthicsUtilitarianismOriginal(Ethics):


 class EthicsUtilitarianism(Ethics):
-    VERSION = 0
    """
    This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared.
    This allows scaling to >5 shots.
    """
+    VERSION = 0
+    DATASET_NAME = "utilitarianism"
+
+    def training_docs(self):
+        rnd = random.Random()
+        for doc in self.dataset["train"]:
+            yield self._process_doc(doc, rnd)

-    def get_prefix(self):
-        return "utilitarianism/util"
+    def validation_docs(self):
+        raise NotImplementedError

-    def process_doc(self, docs):
+    def test_docs(self):
        rnd = random.Random()
-        for doc in docs:
-            rnd.seed(doc[0])
-            ordering = [0, 1]
-            rnd.shuffle(ordering)
-            yield {
-                "scenarios": [doc[ordering[0]], doc[ordering[1]]],
-                "label": int(ordering.index(0) == 0),  # The correct scenario is always first
-            }
+        for doc in self.dataset["test"]:
+            yield self._process_doc(doc, rnd)
+
+    def _process_doc(self, doc, rnd):
+        rnd.seed(doc["activity"])
+        scenarios = [doc["activity"], doc["baseline"]]
+        ordering = [0, 1]
+        rnd.shuffle(ordering)
+        return {
+            "scenarios": [scenarios[ordering[0]], scenarios[ordering[1]]],
+            # The correct scenario is always first
+            "label": int(ordering.index(0) == 0),
+        }

    def doc_to_text(self, doc):
        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
@@ -368,23 +358,19 @@ class EthicsUtilitarianism(Ethics):

 class EthicsVirtue(Ethics):
    VERSION = 0
-    def get_prefix(self):
-        return "virtue/virtue"
+    DATASET_NAME = "virtue"

-    def process_doc(self, doc):
-        # Append identifiers before shuffling to calculate exact matches lateron & skip the first element of headers
-        return [x + [i] for i, x in enumerate(doc[1:])]
-
-    def load_doc(self, filename):
-        with open(filename, newline='') as file:
-            filereader = csv.reader(file)
-            return self.process_doc(list(filereader))
+    def _process_doc(self, doc):
+        return doc

    def doc_to_text(self, doc):
-        return "Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait \"{}\"?\nAnswer:".format(*doc[1].split(" [SEP] "))
+        return "Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait \"{}\"?\nAnswer:".format(
+            doc["scenario"],
+            doc["trait"]
+        )

    def doc_to_target(self, doc):
-        return " {}".format(yesno(int(doc[0])))
+        return " {}".format(yesno(int(doc["label"])))

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
@@ -394,14 +380,15 @@ class EthicsVirtue(Ethics):
    def process_results(self, doc, results):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
-        gold = bool(int(doc[0]))
+        gold = bool(int(doc["label"]))
        return {
            "acc": pred == gold,
-            "em": [doc[-1], pred == gold]
+            "em": [doc["group_id"], pred == gold]
        }

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 5 are correct
+        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
        em_sums = [int(preds_sort[5*i][1]) + int(preds_sort[5*i+1][1]) + int(preds_sort[5*i+2][1]) + int(preds_sort[5*i+3][1]) + int(preds_sort[5*i+4][1]) for i in range(len(preds_sort) // 5)]
        em_cors = [em_sums[i] == 5 for i in range(len(em_sums))]

--- a/lm_eval/tasks/hendrycks_math.py
+++ b/lm_eval/tasks/hendrycks_math.py
-import abc
-import json
-from lm_eval.utils import sh
+"""
+Measuring Mathematical Problem Solving With the MATH Dataset
+https://arxiv.org/pdf/2103.03874.pdf
+
+Math is a dataset of 12,500 challenging competition mathematics problems. Each
+problem in Math has a full step-by-step solution which can be used to teach
+models to generate answer derivations and explanations.
+
+Homepage: https://github.com/hendrycks/math
+"""
+import inspect
+import lm_eval.datasets.hendrycks_math.hendrycks_math
 from lm_eval.metrics import mean
 from lm_eval.base import Task, rf
-from pathlib import Path
-from best_download import download_file
+
+
+_CITATION = """
+@article{hendrycksmath2021,
+  title={Measuring Mathematical Problem Solving With the Math Dataset},
+  author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
+  journal={NeurIPS},
+  year={2021}
+}
+"""


 class Math(Task):
-    """
-    This dataset is based on the following paper:
-    https://arxiv.org/abs/2103.03874 
-    """
-
-    DATASET_PATH = Path('data/MATH')
-
-    def download(self):
-        if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists():
-            sh(f"mkdir -p {self.DATASET_PATH}")
-            download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", local_file=f"{self.DATASET_PATH}.tar", expected_checksum="0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac")
-            sh(f"""
-            tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
-            rm {self.DATASET_PATH}.tar
-            """)
-
-    @abc.abstractmethod
-    def get_file_info(self):
-        """returns directory name"""
-        pass
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.hendrycks_math.hendrycks_math)
+    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -38,22 +37,19 @@ class Math(Task):
    def has_test_docs(self):
        return True

-    def _load_docs(self, path):
-        for file in sorted(path.iterdir()):
-            with open(file) as f:
-                doc = json.load(f)
-                doc["answer"] = self.remove_boxed(
-                    self.last_boxed_only_string(doc["solution"]))
-                yield doc
-
    def training_docs(self):
-        return self._load_docs(self.DATASET_PATH / "train" / self.get_file_info())
+        return map(self._load_doc, self.dataset["train"])

    def validation_docs(self):
        return NotImplemented

    def test_docs(self):
-        return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info())
+        return map(self._load_doc, self.dataset["test"])
+
+    def _load_doc(self, doc):
+        doc["answer"] = self.remove_boxed(
+            self.last_boxed_only_string(doc["solution"]))
+        return doc

    def doc_to_text(self, doc):
        return "Problem: " + doc["problem"] + "\nAnswer:"
@@ -65,7 +61,7 @@ class Math(Task):
        return doc["problem"]

    def doc_to_target(self, doc):
-        return " " + doc["answer"]
+        return " " + doc["solution"]

    def construct_requests(self, doc, ctx):
        return rf.greedy_until(ctx, ["\n"])
@@ -292,41 +288,34 @@ class Math(Task):

 class MathAlgebra(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'algebra'
+    DATASET_NAME = 'algebra'


 class MathCountingAndProbability(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'counting_and_probability'
+    DATASET_NAME = 'counting_and_probability'


 class MathGeometry(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'geometry'
+    DATASET_NAME = 'geometry'


 class MathIntermediateAlgebra(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'intermediate_algebra'
+    DATASET_NAME = 'intermediate_algebra'


 class MathNumberTheory(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'number_theory'
+    DATASET_NAME = 'number_theory'


 class MathPrealgebra(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'prealgebra'
+    DATASET_NAME = 'prealgebra'


 class MathPrecalculus(Math):
    VERSION = 1
-    def get_file_info(self):
-        return 'precalculus'
+    DATASET_NAME = 'precalculus'
--- a/lm_eval/tasks/hendrycks_test.py
+++ b/lm_eval/tasks/hendrycks_test.py
-import csv
-import random
+"""
+Measuring Massive Multitask Language Understanding
+https://arxiv.org/pdf/2009.03300.pdf
+
+The Hendryck's Test is a benchmark that measured a text model’s multitask accuracy.
+The test covers 57 tasks including elementary mathematics, US history, computer 
+science, law, and more. To attain high accuracy on this test, models must possess
+extensive world knowledge and problem solving ability. By comprehensively evaluating
+the breadth and depth of a model’s academic and professional understanding, 
+Hendryck's Test can be used to analyze models across many tasks and to identify 
+important shortcomings.
+
+Homepage: https://github.com/hendrycks/test
+"""
 from lm_eval.base import MultipleChoiceTask
-from ..utils import sh
-from pathlib import Path
-from best_download import download_file
+
+
+_CITATION = """
+@article{hendryckstest2021,
+    title={Measuring Massive Multitask Language Understanding},
+    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+    year={2021}
+}
+"""
+

 SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
            'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
@@ -36,25 +56,15 @@ def create_task(subject):

 class GeneralHendrycksTest(MultipleChoiceTask):
    VERSION = 0
-    DATASET_PATH = Path("data/hendrycksTest/")
+    DATASET_PATH = "hendrycks_test"
+    DATASET_NAME = None

    def __init__(self, subject):
-        self.subject = subject
+        self.DATASET_NAME = subject
        super().__init__()

-    def download(self):
-        if not (self.DATASET_PATH / 'done').exists():
-            sh("mkdir -p data")
-            download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
-            sh("""
-            tar -xf data/data.tar -C data/
-            rm data/data.tar
-            mv data/data data/hendrycksTest
-            touch data/hendrycksTest/done
-            """)
-
    def has_training_docs(self):
-        return True
+        return False

    def has_validation_docs(self):
        return True
@@ -62,8 +72,14 @@ class GeneralHendrycksTest(MultipleChoiceTask):
    def has_test_docs(self):
        return True

-    def _convert_standard(self, doc):
-        def format_example(doc, choices):
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
+        def format_example(doc, keys):
            """
                Question: <prompt>
                Choices:
@@ -73,44 +89,23 @@ class GeneralHendrycksTest(MultipleChoiceTask):
                D. <choice4>
                Answer:
            """
-            prompt = "Question: " + doc[0] + "\nChoices:\n"
-            prompt += "".join([f"{choices[j]}. {doc[j+1]}\n" for j in range(4)])
+            prompt = "Question: " + doc["question"] + "\nChoices:\n"
+            prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])])
            prompt += "Answer:"
            return prompt
-        choices = ['A', 'B', 'C', 'D']
+        keys = ['A', 'B', 'C', 'D']
        return {
-            "query": format_example(doc, choices),
-            "choices": doc[1:5],
-            "gold": choices.index(doc[5])
+            "query": format_example(doc, keys),
+            "choices": doc["choices"],
+            "gold": keys.index(doc["answer"]) if isinstance(doc["answer"], str) else doc["answer"]
        }

-    def _load_docs(self, filename):
-        reader = csv.reader(open(filename, 'r'), quotechar='"', delimiter=',')
-        return (self._convert_standard(doc) for doc in reader)
-
-    def training_docs(self):
-        docs = []
-        for train_dir in ["auxiliary_train", "dev"]:
-            for f in (self.DATASET_PATH / train_dir).iterdir():
-                docs.extend(self._load_docs(f))
-        return docs
-
-    def validation_docs(self):
-        filename = self.DATASET_PATH / "val" / f"{self.subject}_val.csv"
-        return self._load_docs(filename)
-
-    def test_docs(self):
-        filename = self.DATASET_PATH / "test" / f"{self.subject}_test.csv"
-        return self._load_docs(filename)
-
    def fewshot_examples(self, k, rnd):
        # fewshot_examples is not just sampling from train_docs because dev is 
        # in the same distribution as val/test but auxiliary_train isn't

-        filename = self.DATASET_PATH / "dev" / f"{self.subject}_dev.csv"
-
        if self._fewshot_docs is None:
-            self._fewshot_docs = list(self._load_docs(filename))
+            self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"]))

        return rnd.sample(list(self._fewshot_docs), k)


--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
-import json
+"""
+The LAMBADA dataset: Word prediction requiring a broad discourse context∗
+https://arxiv.org/pdf/1606.06031.pdf
+
+LAMBADA is a dataset to evaluate the capabilities of computational models for text
+understanding by means of a word prediction task. LAMBADA is a collection of narrative
+passages sharing the characteristic that human subjects are able to guess their last
+word if they are exposed to the whole passage, but not if they only see the last
+sentence preceding the target word. To succeed on LAMBADA, computational models
+cannot simply rely on local context, but must be able to keep track of information
+in the broader discourse.
+
+Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
+"""
+import inspect
+import lm_eval.datasets.lambada.lambada
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean, perplexity
-from lm_eval.utils import sh
-from best_download import download_file
-import os
+
+
+_CITATION = """
+@misc{
+    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 
+    title={The LAMBADA dataset},
+    DOI={10.5281/zenodo.2630551},
+    publisher={Zenodo},
+    year={2016},
+    month={Aug}
+}
+"""


 class LAMBADA(Task):
    VERSION = 0
-    def download(self):
-        sh("mkdir -p data/lambada")
-        try:
-            if not os.path.exists("data/lambada/lambada_test.jsonl"):
-                download_file(
-                    "http://eaidata.bmk.sh/data/lambada_test.jsonl", 
-                    local_file="data/lambada/lambada_test.jsonl",
-                    expected_checksum="4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
-                )
-        except:
-            # fallback - for some reason best_download doesnt work all the time here
-            sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
-            sh('echo "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226  data/lambada/lambada_test.jsonl" | sha256sum --check')
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.lambada.lambada)

    def has_training_docs(self):
        return False
@@ -35,9 +47,7 @@ class LAMBADA(Task):
        pass

    def validation_docs(self):
-        with open("data/lambada/lambada_test.jsonl") as fh:
-            for line in fh:
-                yield json.loads(line)
+        return self.dataset["validation"]

    def test_docs(self):
        pass

--- a/lm_eval/tasks/lambada_cloze.py
+++ b/lm_eval/tasks/lambada_cloze.py
-import json
-from lm_eval.base import Task, rf
-from lm_eval.metrics import mean, perplexity
-from lm_eval.utils import sh
+"""
+The LAMBADA dataset: Word prediction requiring a broad discourse context∗
+https://arxiv.org/pdf/1606.06031.pdf
+
+Cloze-style LAMBADA dataset.
+LAMBADA is a dataset to evaluate the capabilities of computational models for text
+understanding by means of a word prediction task. LAMBADA is a collection of narrative
+passages sharing the characteristic that human subjects are able to guess their last
+word if they are exposed to the whole passage, but not if they only see the last
+sentence preceding the target word. To succeed on LAMBADA, computational models
+cannot simply rely on local context, but must be able to keep track of information
+in the broader discourse.
+
+Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
+"""
 from lm_eval.tasks.lambada import LAMBADA
-from best_download import download_file
+
+
+_CITATION = """
+@misc{
+    author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 
+    title={The LAMBADA dataset},
+    DOI={10.5281/zenodo.2630551},
+    publisher={Zenodo},
+    year={2016},
+    month={Aug}
+}
+"""


 class LAMBADA_cloze(LAMBADA):
    VERSION = 0
+
    def doc_to_text(self, doc):
        return doc['text'].rsplit(' ', 1)[0] + " ____. ->"


--- a/lm_eval/tasks/lambada_multilingual.py
+++ b/lm_eval/tasks/lambada_multilingual.py