in-place replace main with lm-eval2, keeping old git history

d2a9b759 · haileyschoelkopf · 814940e8 · d2a9b759 · d2a9b759 · d2a9b759
Commit d2a9b759 authored Apr 19, 2023 by haileyschoelkopf
20 changed files
--- a/examples/configurable_task/sglue_cb.yaml
+++ b/examples/configurable_task/sglue_cb.yaml
+# Example configurable task: the `cb` configuration of the `super_glue` dataset.
+dataset_path: super_glue
+dataset_name: cb
+# Names of the dataset splits this task refers to.
+training_split: train
+validation_split: validation
+# Prompt template. `{{premise}}` and `{{hypothesis}}` are template placeholders
+# (presumably filled from the fields of each document -- TODO confirm).
+doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe?"
+# Target template: indexes ['Yes', 'No', 'Maybe'] with the document's `label`.
+doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
+# Each entry is [metric name, aggregation name, boolean flag]. The meaning of the
+# trailing `true` is not visible here (presumably "higher is better" -- TODO confirm).
+metric_list: [
+  [exact_match, mean, true]
+  ]
+# Optional filter pipeline, currently disabled.
+# filters: [
+#   ["none", ["take_first"]]
+# ]
+
--- a/lm_eval/api/__init__.py
+++ b/lm_eval/api/__init__.py
+from . import metrics
+
+# Lookup table from a metric's name to the function in `metrics` that implements it.
+METRIC_REGISTRY = dict(
+    matthews_corrcoef=metrics.matthews_corrcoef,
+    f1_score=metrics.f1_score,
+    perplexity=metrics.perplexity,
+    bleu=metrics.bleu,
+    chrf=metrics.chrf,
+    ter=metrics.ter,
+)
+
+# Lookup table from an aggregation's name to the function in `metrics` that implements it.
+AGGREGATION_REGISTRY = dict(
+    mean=metrics.mean,
+    median=metrics.median,
+)
\ No newline at end of file
--- a/lm_eval/api/filter.py
+++ b/lm_eval/api/filter.py
+from dataclasses import dataclass
+from typing import List
+
+from lm_eval.api.instance import Instance
+
+class Filter:
+    """
+    Base class for post-processing steps applied to a task's model outputs.
+
+    A filter works at the task level: it receives the responses (`instance.resps`)
+    of all of a task's instances at once and returns their filtered counterparts.
+    Any number of separate filters, or lists of filters, can be configured in a single run.
+    """
+
+    def __init__(self):
+        """
+        Hook for subclasses whose individual instantiations need to carry state.
+        The base implementation does nothing.
+        """
+
+    def apply(self, resps):
+        """
+        Filter the responses of a list of instances.
+
+        :param resps: list
+            One entry per instance, each being that instance's `resps`, i.e.
+            [<inst.resps for instance 0>, <inst.resps for instance 1>, ...]
+        :return: list
+            The (filtered) response lists *in the same order as they were input*, i.e.
+            [<filtered resps for instance 0>, <filtered resps for instance 1>, ...].
+            The base implementation is the identity: it returns `resps` itself.
+        """
+        unfiltered = resps
+        return unfiltered
+        
+@dataclass
+class FilterEnsemble:
+    """
+    An ordered pipeline of `Filter` objects applied to a task's model responses.
+
+    Its intended usage is to stack multiple post-processing steps in order.
+    `task.apply_filters` should use a list of FilterEnsemble objects that it stores,
+    to apply each pipeline separately.
+
+    Results are stored per instance in `inst.filtered_resps[name]`, so each
+    FilterEnsemble applied in a given run should use a different `name`.
+    """
+
+    name: str
+    filters: List[Filter]
+
+    def apply(self, instances: List[Instance]) -> None:
+        """
+        Run every filter, in order, over the responses of `instances` and record
+        the final result on each instance under the key `self.name`.
+
+        :param instances: list
+            The instances whose `resps` are filtered; they are updated in place.
+        :raises ValueError:
+            If the pipeline does not produce exactly one entry per instance.
+        """
+        # operate just on the model responses
+        resps = [inst.resps for inst in instances]
+        for f in self.filters:
+            # apply filters in sequence: each one consumes the previous one's output
+            # TODO: handle the case where a filter returns multiple "buckets"
+            resps = f.apply(resps)
+
+        # Materialize so the result can be measured even if a filter returned an iterator.
+        resps = list(resps)
+        # zip() stops at the shorter input, which would silently leave some instances
+        # without an entry for this pipeline; fail loudly instead.
+        if len(resps) != len(instances):
+            raise ValueError(
+                f"FilterEnsemble '{self.name}' returned {len(resps)} response lists "
+                f"for {len(instances)} instances"
+            )
+
+        # add the end results after filtering to filtered_resps of their respective source instances.
+        for inst, resp in zip(instances, resps):
+            inst.filtered_resps[self.name] = resp
+
+            
+
--- a/lm_eval/api/instance.py
+++ b/lm_eval/api/instance.py
+from dataclasses import dataclass, field
+
+@dataclass
+class Instance:
+    """
+    Binds together one model request, the document it was built from, and the
+    responses collected for it.
+    """
+
+    request_type: str = None  # TODO: make this an enum?
+    # the document this request was built from
+    doc: dict = None
+    # arguments of the request; see the `args` property
+    arguments: tuple = None
+    # idx within a doc's requests
+    id_: int = None
+    # (task name, doc idx, num. repeats)
+    metadata: tuple = None  # TODO: better typehints here
+    # the returns from each call of the model on this particular set of arguments
+    resps: list = field(default_factory=list)
+    # one key for each set of filters applied
+    filtered_resps: dict = field(default_factory=dict)
+
+    # overwritten from `metadata` in __post_init__ whenever `metadata` is given
+    task_name: str = None
+    doc_id: str = None
+    # NOTE(review): annotated as str, but looks like a count of repeats -- confirm
+    repeats: str = None
+
+    def __post_init__(self):
+        """
+        Unpack `metadata` into `task_name`, `doc_id` and `repeats`.
+
+        When `metadata` is left at its default of None the three fields keep
+        whatever values were passed to the constructor.
+        """
+        # `metadata` defaults to None, and unpacking None raises TypeError, which made
+        # the class (and its subclasses) impossible to construct without metadata.
+        if self.metadata is not None:
+            self.task_name, self.doc_id, self.repeats = self.metadata
+
+    @property
+    def args(self):
+        """
+        Returns `arguments` as a tuple: unchanged if it already is one, otherwise
+        wrapped as `(arguments,)`, e.g. (string,) where `string` is the string to
+        calculate loglikelihood over.
+        """
+        return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+
+# import abc
+
+# class Instance(abc.ABC):
+#     """
+#     A class used to bind together all necessary information and metadata for 
+#     running forward pass of a model on a specific datapoint. 
+
+#     """
+
+#     # all Instance subclasses have an attribute which is the name of the LM() class function they call to get outputs.
+#     request_type = None
+
+#     def __init__(self, doc, arguments=None, id_=None, metadata=("", None, None)):
+
+#         self.doc = doc # store the document which we're using. this is a dict
+#         self.arguments = arguments
+
+#         # need: task name, doc idx, num. repeats
+#         self.task_name, self.doc_id, self.repeats = metadata
+#         # id_ = idx within a doc's requests
+#         self.id_ = id_
+
+#         # handle repeats internally. should be able to run K times on exact same input/output pair
+#         # self.repeats = repeats
+        
+#         # list containing the returns from each call of the model on this particular set of arguments.
+#         self.resps = []
+#         # filtered_resps should end up a dict, with a different key for each set of filters to apply. calculate results against each key in filtered_resps
+#         self.filtered_resps = {}
+
+#         #TODO: add more info as needed for detailed logging
+
+#     def __repr__(self):
+#         return f"Req_{self.request_type}{self.args}{self.id_}"
+
+@dataclass
+class LoglikelihoodInstance(Instance):
+    """Instance whose `request_type` defaults to "loglikelihood"."""
+
+    request_type: str = "loglikelihood"
+
+@dataclass
+class RollingLoglikelihoodInstance(Instance):
+    """Instance whose `request_type` defaults to "loglikelihood_rolling"."""
+
+    request_type: str = "loglikelihood_rolling"
+
+@dataclass
+class GenerationInstance(Instance):
+    """Instance whose `request_type` defaults to "greedy_until"."""
+
+    request_type: str = "greedy_until"
--- a/lm_eval/metrics.py
+++ b/lm_eval/metrics.py
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
+import abc
+
+from lm_eval import utils
+
+
+class LM(abc.ABC):
+    def __init__(self):
+        """Defines the interface that should be implemented by all LM subclasses.
+        LMs are assumed to take text (strings) as input and yield strings as output
+        (inputs/outputs should be tokenization-agnostic.)
+
+        """
+
+    @abc.abstractmethod
+    def loglikelihood(self, requests):
+        """Compute the log-likelihood of generating a continuation from a context.
+
+        Downstream tasks should attempt to use loglikelihood instead of other
+        LM calls whenever possible.
+
+        :param requests: list
+            A list of pairs (context, continuation)
+            context: str
+                Context string. Implementations of LM must be able to handle an
+                empty context string.
+            continuation: str
+                The continuation over which the log-likelihood is calculated. If
+                there is a word boundary, the space belongs to the continuation:
+                context="hello" continuation=" world" is correct.
+        :return: list
+            A list of pairs (logprob, isgreedy)
+            logprob: float
+                The log probability of `continuation`
+            isgreedy:
+                Whether `continuation` would be generated by greedy sampling from `context`
+        """
+
+    @abc.abstractmethod
+    def loglikelihood_rolling(self, requests):
+        """Compute the full log-likelihood of a string, with no truncation, for perplexity computation.
+
+        - The full max context length of the model is used.
+        - Inputs that exceed the max context length are tokenized and divided into chunks of up to
+          the max context length.
+        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other
+          implementations which may simply concatenate multiple documents together.
+        - IMPORTANT: The amount of context for each prediction is maximized. Specifically, for inputs
+          that are broken into multiple chunks, the last input will still have a full-sized context.
+          Example:
+            Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
+            Prefix: EOT
+            Max context length: 4
+            Resulting input/prediction pairs:
+
+                INPUT:  EOT   0   1   2
+                PRED:     0   1   2   3
+
+                INPUT:    3   4   5   6
+                PRED:     4   5   6   7
+
+                INPUT:    5   6   7   8
+                PRED:             8   9
+
+          Observe that:
+            1. Each token is predicted exactly once
+            2. For the last pair, the full context is provided, but only the last two tokens are scored
+
+        :param requests: list
+            A list of strings
+            string: str
+                String for which we are computing per-token loglikelihood
+        :return: list
+            A list of pairs (logprob, isgreedy)
+            logprob: float
+                The log probability of `continuation`
+            isgreedy:
+                Whether `continuation` would be generated by greedy sampling from `context`
+            NOTE(review): requests here are plain strings without a separate
+            `context`/`continuation`, so this return description looks copied from
+            `loglikelihood` -- confirm the intended return shape.
+        """
+
+    # TODO: Add an optional max length
+    @abc.abstractmethod
+    def greedy_until(self, requests):
+        """Generate greedily until a stopping sequence is produced.
+
+        :param requests: list
+            A list of pairs (context, until)
+            context: str
+                Context string
+            until: [str]
+                The string sequences to generate until. These string sequences
+                may each span across multiple tokens, or may be part of one token.
+        :return: list
+            A list of strings continuation
+            continuation: str
+                The generated continuation.
+        """
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        """Alternate constructor: build an instance from an argument string.
+
+        :param arg_string:
+            Passed to `utils.simple_parse_args_string`; the parsed result is
+            forwarded to the constructor as keyword arguments.
+        :param additional_config: dict, optional
+            Extra keyword arguments for the constructor. Entries whose value is
+            None are dropped.
+        :return:
+            `cls(...)` called with both sets of keyword arguments.
+        """
+        if additional_config is None:
+            additional_config = {}
+        parsed_args = utils.simple_parse_args_string(arg_string)
+        extra_args = {
+            key: value for key, value in additional_config.items() if value is not None
+        }
+        return cls(**parsed_args, **extra_args)
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
+
+
+
+class Sampler:  # TODO: make this abstract class?
+    """
+    Draws few-shot examples from a set of docs and renders them into a context string.
+    """
+
+    def __init__(self, docs, task, fewshot_indices=None, rnd=None):
+        """
+        :param docs:
+            HF dataset split, provided by task._fewshot_docs()
+        :param task:
+            The task the examples are drawn for; its `_config`, `doc_to_text` and
+            `doc_to_target` are used.
+        :param fewshot_indices: optional
+            If given (and non-empty), `docs` is narrowed down with `docs.select(fewshot_indices)`.
+        :param rnd:
+            Source of randomness; must provide `sample(population, k)`. Required.
+        :raises ValueError:
+            If `rnd` is not provided.
+        """
+        # Explicit check rather than `assert`, which is stripped when Python runs with -O.
+        if not rnd:
+            raise ValueError(f"must pass rnd to {type(self).__name__}!")
+        self.rnd = rnd
+
+        self.task = task
+        self.config = task._config
+
+        self.delimiter = self.config.delimiter
+
+        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
+        if fewshot_indices:  # subset few-shot docs to the requested indices
+            self.docs = self.docs.select(fewshot_indices)
+
+    def get_context(self, doc, num_fewshot):
+        """
+        Build the few-shot context string for `doc`.
+
+        :param doc:
+            The document being evaluated; it is excluded from the few-shot examples.
+        :param num_fewshot: int
+            Maximum number of few-shot examples to include.
+        :return: str
+            The rendered examples (`doc_to_text` + `doc_to_target` for each one) joined
+            by, and followed by, the configured delimiter. `doc` itself is NOT appended;
+            do this outside the object.
+        """
+        # If the few-shot split is the test split, `doc` itself may be drawn, so draw one
+        # extra sample to still have `num_fewshot` examples after it is removed below.
+        if self.config.fewshot_split == self.config.test_split:
+            n_samples = num_fewshot + 1
+        else:
+            n_samples = num_fewshot
+
+        fewshotex = self.sample(n_samples)
+
+        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
+        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
+
+        labeled_examples = (
+            self.delimiter.join(
+                self.task.doc_to_text(example) + self.task.doc_to_target(example)
+                for example in selected_docs
+            )
+            + self.delimiter
+        )
+
+        # only returns the fewshot context! Does not append the document, do this outside the object
+        return labeled_examples
+
+    def sample(self, n):
+        """
+        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
+        """
+
+        return self.rnd.sample(self.docs, n)
+
+
+class BalancedSampler(Sampler):
+
+    def sample(self, n):
+        """
+        Not implemented yet: currently returns None.
+
+        TODO: this should return approximately class-balanced samples from our fewshot examples.
+        TODO: what order should they be in?
+        """
+
+        return None
+
+class ManualSampler(Sampler):
+
+    def sample(self, n):
+        """
+        Not implemented yet: currently returns None.
+        """
+        return None
+
+
+# TODO: how should we do design here? might be better to have a single sampler and pass more kwargs at init. 
+# Depends what's easier for new user to add own functionality on top of
+
+# types of sampler:
+# - class-balanced, randomly shuffled
+# - class-balanced, one particular set of fewshot examples for all evaled instances
+# - hand-specify number of fewshot examples per class?
+# - random, varies per example (check that this is curr. default in old repo)
+# - random, unified per example
+# - enforce a specific fixed fewshot string! (or should we not use this, in favor of including it in prompt template directly)
+
+
+# - user-specified doc indices to restrict fewshot doc options to
+# - user specifies split to use for drawing fewshot instances (TODO: manually prevent this from being same split you eval!)
+# - user specifies a prepended "description"/string to add in front of the (prompted) input
+
+# - user specifies a location to draw fewshot samples from? DO THIS IN TASK CLASS
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
--- a/lm_eval/datasets/README.md
+++ b/lm_eval/datasets/README.md
-# datasets
-
-This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not.
-
-__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path.
-
-
-__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed.
--- a/lm_eval/datasets/__init__.py
+++ b/lm_eval/datasets/__init__.py
--- a/lm_eval/datasets/asdiv/__init__.py
+++ b/lm_eval/datasets/asdiv/__init__.py
--- a/lm_eval/datasets/asdiv/asdiv.py
+++ b/lm_eval/datasets/asdiv/asdiv.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""ASDIV dataset."""
-
-
-import os
-import xml.etree.ElementTree as ET
-
-import datasets
-
-
-_CITATION = """\
-@misc{miao2021diverse,
-    title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
-    author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
-    year={2021},
-    eprint={2106.15772},
-    archivePrefix={arXiv},
-    primaryClass={cs.AI}
-}
-"""
-
-_DESCRIPTION = """\
-ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
-patterns and problem types) English math word problem (MWP) corpus for evaluating
-the capability of various MWP solvers. Existing MWP corpora for studying AI progress
-remain limited either in language usage patterns or in problem types. We thus present
-a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
-types taught in elementary school. Each MWP is annotated with its problem type and grade
-level (for indicating the level of difficulty).
-"""
-
-_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
-
-
-class ASDiv(datasets.GeneratorBasedBuilder):
-    """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name="asdiv",
-            version=VERSION,
-            description="A diverse corpus for evaluating and developing english math word problem solvers",
-        )
-    ]
-
-    def _info(self):
-        features = datasets.Features(
-            {
-                "body": datasets.Value("string"),
-                "question": datasets.Value("string"),
-                "solution_type": datasets.Value("string"),
-                "answer": datasets.Value("string"),
-                "formula": datasets.Value("string"),
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = _URLS
-        data_dir = dl_manager.download_and_extract(urls)
-        base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50"
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, base_filepath, "dataset", "ASDiv.xml"
-                    ),
-                    "split": datasets.Split.VALIDATION,
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        tree = ET.parse(filepath)
-        root = tree.getroot()
-        for key, problem in enumerate(root.iter("Problem")):
-            yield key, {
-                "body": problem.find("Body").text,
-                "question": problem.find("Question").text,
-                "solution_type": problem.find("Solution-Type").text,
-                "answer": problem.find("Answer").text,
-                "formula": problem.find("Formula").text,
-            }
--- a/lm_eval/datasets/asdiv/dataset_infos.json
+++ b/lm_eval/datasets/asdiv/dataset_infos.json
-{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n    title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n    author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n    year={2021},\n    eprint={2106.15772},\n    archivePrefix={arXiv},\n    primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}}
--- a/lm_eval/datasets/coqa/__init__.py
+++ b/lm_eval/datasets/coqa/__init__.py
--- a/lm_eval/datasets/coqa/coqa.py
+++ b/lm_eval/datasets/coqa/coqa.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""CoQA dataset.
-
-This `CoQA` adds the "additional_answers" feature that's missing in the original
-datasets version:
-https://github.com/huggingface/datasets/blob/master/datasets/coqa/coqa.py
-"""
-
-
-import json
-
-import datasets
-
-
-_CITATION = """\
-@misc{reddy2018coqa,
-    title={CoQA: A Conversational Question Answering Challenge},
-    author={Siva Reddy and Danqi Chen and Christopher D. Manning},
-    year={2018},
-    eprint={1808.07042},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-CoQA is a large-scale dataset for building Conversational Question Answering
-systems. The goal of the CoQA challenge is to measure the ability of machines to
-understand a text passage and answer a series of interconnected questions that
-appear in a conversation.
-"""
-
-_HOMEPAGE = "https://stanfordnlp.github.io/coqa/"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-_URLS = {
-    "train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json",
-    "validation": "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json",
-}
-
-# `additional_answers` are not available in the train set so we fill them with
-# empty dicts of the same form.
-_EMPTY_ADDITIONAL_ANSWER = {
-    "0": [
-        {
-            "span_start": -1,
-            "span_end": -1,
-            "span_text": "",
-            "input_text": "",
-            "turn_id": -1,
-        }
-    ],
-    "1": [
-        {
-            "span_start": -1,
-            "span_end": -1,
-            "span_text": "",
-            "input_text": "",
-            "turn_id": -1,
-        }
-    ],
-    "2": [
-        {
-            "span_start": -1,
-            "span_end": -1,
-            "span_text": "",
-            "input_text": "",
-            "turn_id": -1,
-        }
-    ],
-}
-
-
-class Coqa(datasets.GeneratorBasedBuilder):
-    """CoQA is a large-scale dataset for building Conversational Question Answering systems."""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name="coqa", version=VERSION, description="The CoQA dataset."
-        ),
-    ]
-
-    def _info(self):
-        features = datasets.Features(
-            {
-                "id": datasets.Value("string"),
-                "source": datasets.Value("string"),
-                "story": datasets.Value("string"),
-                "questions": datasets.features.Sequence(
-                    {
-                        "input_text": datasets.Value("string"),
-                        "turn_id": datasets.Value("int32"),
-                    }
-                ),
-                "answers": datasets.features.Sequence(
-                    {
-                        "span_start": datasets.Value("int32"),
-                        "span_end": datasets.Value("int32"),
-                        "span_text": datasets.Value("string"),
-                        "input_text": datasets.Value("string"),
-                        "turn_id": datasets.Value("int32"),
-                    }
-                ),
-                "additional_answers": {
-                    "0": datasets.features.Sequence(
-                        {
-                            "span_start": datasets.Value("int32"),
-                            "span_end": datasets.Value("int32"),
-                            "span_text": datasets.Value("string"),
-                            "input_text": datasets.Value("string"),
-                            "turn_id": datasets.Value("int32"),
-                        }
-                    ),
-                    "1": datasets.features.Sequence(
-                        {
-                            "span_start": datasets.Value("int32"),
-                            "span_end": datasets.Value("int32"),
-                            "span_text": datasets.Value("string"),
-                            "input_text": datasets.Value("string"),
-                            "turn_id": datasets.Value("int32"),
-                        }
-                    ),
-                    "2": datasets.features.Sequence(
-                        {
-                            "span_start": datasets.Value("int32"),
-                            "span_end": datasets.Value("int32"),
-                            "span_text": datasets.Value("string"),
-                            "input_text": datasets.Value("string"),
-                            "turn_id": datasets.Value("int32"),
-                        }
-                    ),
-                },
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
-        data_dirs = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dirs["train"],
-                    "split": datasets.Split.TRAIN,
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dirs["validation"],
-                    "split": datasets.Split.VALIDATION,
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        with open(filepath, encoding="utf-8") as f:
-            data = json.load(f)
-            for row in data["data"]:
-                id = row["id"]
-                source = row["source"]
-                story = row["story"]
-                questions = [
-                    {"input_text": q["input_text"], "turn_id": q["turn_id"]}
-                    for q in row["questions"]
-                ]
-                answers = [
-                    {
-                        "span_start": a["span_start"],
-                        "span_end": a["span_end"],
-                        "span_text": a["span_text"],
-                        "input_text": a["input_text"],
-                        "turn_id": a["turn_id"],
-                    }
-                    for a in row["answers"]
-                ]
-                if split == datasets.Split.TRAIN:
-                    additional_answers = _EMPTY_ADDITIONAL_ANSWER
-                else:
-                    additional_answers = {
-                        "0": [
-                            {
-                                "span_start": a0["span_start"],
-                                "span_end": a0["span_end"],
-                                "span_text": a0["span_text"],
-                                "input_text": a0["input_text"],
-                                "turn_id": a0["turn_id"],
-                            }
-                            for a0 in row["additional_answers"]["0"]
-                        ],
-                        "1": [
-                            {
-                                "span_start": a1["span_start"],
-                                "span_end": a1["span_end"],
-                                "span_text": a1["span_text"],
-                                "input_text": a1["input_text"],
-                                "turn_id": a1["turn_id"],
-                            }
-                            for a1 in row["additional_answers"]["1"]
-                        ],
-                        "2": [
-                            {
-                                "span_start": a2["span_start"],
-                                "span_end": a2["span_end"],
-                                "span_text": a2["span_text"],
-                                "input_text": a2["input_text"],
-                                "turn_id": a2["turn_id"],
-                            }
-                            for a2 in row["additional_answers"]["2"]
-                        ],
-                    }
-                yield row["id"], {
-                    "id": id,
-                    "story": story,
-                    "source": source,
-                    "questions": questions,
-                    "answers": answers,
-                    "additional_answers": additional_answers,
-                }
--- a/lm_eval/datasets/coqa/dataset_infos.json
+++ b/lm_eval/datasets/coqa/dataset_infos.json
-{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n    title={CoQA: A Conversational Question Answering Challenge},\n    author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n    year={2018},\n    eprint={1808.07042},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", 
"id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}}
--- a/lm_eval/datasets/drop/__init__.py
+++ b/lm_eval/datasets/drop/__init__.py
--- a/lm_eval/datasets/drop/dataset_infos.json
+++ b/lm_eval/datasets/drop/dataset_infos.json
-{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n    year={2019},\n    eprint={1903.00161},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, 
"_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}}
--- a/lm_eval/datasets/drop/drop.py
+++ b/lm_eval/datasets/drop/drop.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
-# even if there are multiple types of answers for the same question.
-"""DROP dataset."""
-
-
-import json
-import os
-
-import datasets
-
-
-_CITATION = """\
-@misc{dua2019drop,
-    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
-    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
-    year={2019},
-    eprint={1903.00161},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
-this crowdsourced, adversarially-created, 96k question-answering benchmark, a
-system must resolve multiple references in a question, map them onto a paragraph,
-and perform discrete operations over them (such as addition, counting, or sorting).
-"""
-
-_HOMEPAGE = "https://allenai.org/data/drop"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-_URLS = {
-    "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
-}
-
-_EMPTY_VALIDATED_ANSWER = [
-    {
-        "number": "",
-        "date": {
-            "day": "",
-            "month": "",
-            "year": "",
-        },
-        "spans": [],
-        "worker_id": "",
-        "hit_id": "",
-    }
-]
-
-
-class Drop(datasets.GeneratorBasedBuilder):
-    """DROP is a QA dataset which tests comprehensive understanding of paragraphs."""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name="drop", version=VERSION, description="The DROP dataset."
-        ),
-    ]
-
-    def _info(self):
-        features = datasets.Features(
-            {
-                "section_id": datasets.Value("string"),
-                "passage": datasets.Value("string"),
-                "question": datasets.Value("string"),
-                "query_id": datasets.Value("string"),
-                "answer": {
-                    "number": datasets.Value("string"),
-                    "date": {
-                        "day": datasets.Value("string"),
-                        "month": datasets.Value("string"),
-                        "year": datasets.Value("string"),
-                    },
-                    "spans": datasets.features.Sequence(datasets.Value("string")),
-                    "worker_id": datasets.Value("string"),
-                    "hit_id": datasets.Value("string"),
-                },
-                "validated_answers": datasets.features.Sequence(
-                    {
-                        "number": datasets.Value("string"),
-                        "date": {
-                            "day": datasets.Value("string"),
-                            "month": datasets.Value("string"),
-                            "year": datasets.Value("string"),
-                        },
-                        "spans": datasets.features.Sequence(datasets.Value("string")),
-                        "worker_id": datasets.Value("string"),
-                        "hit_id": datasets.Value("string"),
-                    }
-                ),
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = _URLS[self.config.name]
-        data_dir = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "drop_dataset", "drop_dataset_train.json"
-                    ),
-                    "split": "train",
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "drop_dataset", "drop_dataset_dev.json"
-                    ),
-                    "split": "validation",
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        with open(filepath, encoding="utf-8") as f:
-            data = json.load(f)
-            key = 0
-            for section_id, example in data.items():
-                # Each example (passage) has multiple sub-question-answer pairs.
-                for qa in example["qa_pairs"]:
-                    # Build answer.
-                    answer = qa["answer"]
-                    answer = {
-                        "number": answer["number"],
-                        "date": {
-                            "day": answer["date"].get("day", ""),
-                            "month": answer["date"].get("month", ""),
-                            "year": answer["date"].get("year", ""),
-                        },
-                        "spans": answer["spans"],
-                        "worker_id": answer.get("worker_id", ""),
-                        "hit_id": answer.get("hit_id", ""),
-                    }
-                    validated_answers = []
-                    if "validated_answers" in qa:
-                        for validated_answer in qa["validated_answers"]:
-                            va = {
-                                "number": validated_answer.get("number", ""),
-                                "date": {
-                                    "day": validated_answer["date"].get("day", ""),
-                                    "month": validated_answer["date"].get("month", ""),
-                                    "year": validated_answer["date"].get("year", ""),
-                                },
-                                "spans": validated_answer.get("spans", ""),
-                                "worker_id": validated_answer.get("worker_id", ""),
-                                "hit_id": validated_answer.get("hit_id", ""),
-                            }
-                            validated_answers.append(va)
-                    else:
-                        validated_answers = _EMPTY_VALIDATED_ANSWER
-                    yield key, {
-                        "section_id": section_id,
-                        "passage": example["passage"],
-                        "question": qa["question"],
-                        "query_id": qa["query_id"],
-                        "answer": answer,
-                        "validated_answers": validated_answers,
-                    }
-                    key += 1
--- a/lm_eval/datasets/headqa/__init__.py
+++ b/lm_eval/datasets/headqa/__init__.py