in-place replace main with lm-eval2, keeping old git history

d2a9b759 · haileyschoelkopf · 814940e8 · d2a9b759 · d2a9b759 · d2a9b759
Commit d2a9b759 authored Apr 19, 2023 by haileyschoelkopf
20 changed files
--- a/examples/configurable_task/sglue_cb.yaml
+++ b/examples/configurable_task/sglue_cb.yaml
+# Example configurable task: the `cb` subset of the `super_glue` dataset.
+dataset_path: super_glue
+dataset_name: cb
+training_split: train
+validation_split: validation
+# Prompt template; {{premise}} and {{hypothesis}} are filled in from each document.
+doc_to_text: "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe?"
+# Target template: indexes answer_choices with the document's `label` field.
+doc_to_target: "{% set answer_choices = ['Yes', 'No', 'Maybe'] %}{{answer_choices[label]}}"
+# NOTE(review): each entry looks like [metric, aggregation, higher_is_better] - confirm against the task loader.
+metric_list: [
+  [exact_match, mean, true]
+  ]
+# Optional filter pipelines, written as [name, [filter components]]; disabled in this example.
+# filters: [
+#   ["none", ["take_first"]]
+# ]
+
--- a/lm_eval/api/__init__.py
+++ b/lm_eval/api/__init__.py
+from . import metrics
+
+# Maps a metric name to the function in `lm_eval.api.metrics` that implements it.
+METRIC_REGISTRY = {
+    "matthews_corrcoef": metrics.matthews_corrcoef,
+    "f1_score": metrics.f1_score,
+    "perplexity": metrics.perplexity,
+    "bleu": metrics.bleu,
+    "chrf": metrics.chrf,
+    "ter": metrics.ter,
+}
+
+# Maps an aggregation name to the function in `lm_eval.api.metrics` that implements it.
+AGGREGATION_REGISTRY = {
+    "mean": metrics.mean,
+    "median": metrics.median
+}
\ No newline at end of file
--- a/lm_eval/api/filter.py
+++ b/lm_eval/api/filter.py
+from dataclasses import dataclass
+from typing import List
+
+from lm_eval.api.instance import Instance
+
+class Filter:
+    """
+    Filter classes operate on a per-task level.
+    They take all model outputs (`instance.resps` for all `task.instances`)
+    across all instances of a task, and perform operations.
+    In a single run, one can configure any number of separate filters or lists of filters.
+
+    This base class is a no-op: `apply` returns its input unchanged.
+    """
+
+    def __init__(self):
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, resps):
+        """
+        Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
+        Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
+        if passed [<inst.resps for instance 0>, <inst.resps for instance 1>] it should return
+        [<filtered resps for instance 0>, <filtered resps for instance 1>]
+        """
+        return resps
+        
+@dataclass
+class FilterEnsemble:
+    """
+    A named pipeline of filters that are applied one after another.
+    Use it to stack several post-processing steps in a fixed order.
+    `task.apply_filters` should keep a list of FilterEnsemble objects and run each
+    pipeline separately.
+    """
+    name: str
+    filters: List[Filter]
+
+    def apply(self, instances: List[Instance]):
+        # The filters only ever see the model responses, never the Instance objects themselves.
+        current = [instance.resps for instance in instances]
+
+        # Feed the output of each stage into the next one.
+        # TODO: handle the case where a filter returns multiple "buckets"
+        for stage in self.filters:
+            current = stage.apply(current)
+
+        # Store the final result on each source instance under this pipeline's name;
+        # every FilterEnsemble applied in a given run should therefore use a different name.
+        for instance, filtered in zip(instances, current):
+            instance.filtered_resps[self.name] = filtered
+
+            
+
--- a/lm_eval/api/instance.py
+++ b/lm_eval/api/instance.py
+from dataclasses import dataclass, field
+
+@dataclass
+class Instance:
+    """
+    Bundles one model request with the document it was built from and the responses collected for it.
+
+    `metadata`, when given, is a `(task_name, doc_id, repeats)` triple that is unpacked into the
+    attributes of the same names after initialisation. `resps` and `filtered_resps` start out empty;
+    `filtered_resps` is keyed by the name of the filter pipeline that produced each entry.
+    """
+    request_type: str = None # TODO: make this an enum?
+    doc: dict = None
+    arguments: tuple = None
+    id_: int = None
+    metadata: tuple = None # TODO: better typehints here
+    resps: list = field(default_factory=list)
+    filtered_resps: dict = field(default_factory=dict)
+
+    task_name: str = None
+    # `Task.build_all_requests` passes an enumerate() index and an integer repeat count here.
+    doc_id: int = None
+    repeats: int = None
+
+    def __post_init__(self):
+        # `metadata` defaults to None, and unpacking None raises TypeError. Only unpack when a
+        # triple was actually supplied; otherwise keep whatever was passed for the three fields.
+        if self.metadata is not None:
+            self.task_name, self.doc_id, self.repeats = self.metadata
+
+    @property
+    def args(self):
+        """
+        Returns `arguments` as a tuple, wrapping a non-tuple value in a 1-tuple
+        (e.g. `(string,)` where `string` is the string to calculate loglikelihood over).
+        """
+        return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
+
+# import abc
+
+# class Instance(abc.ABC):
+#     """
+#     A class used to bind together all necessary information and metadata for 
+#     running forward pass of a model on a specific datapoint. 
+
+#     """
+
+#     # all Instance subclasses have an attribute which is the name of the LM() class function they call to get outputs.
+#     request_type = None
+
+#     def __init__(self, doc, arguments=None, id_=None, metadata=("", None, None)):
+
+#         self.doc = doc # store the document which we're using. this is a dict
+#         self.arguments = arguments
+
+#         # need: task name, doc idx, num. repeats
+#         self.task_name, self.doc_id, self.repeats = metadata
+#         # id_ = idx within a doc's requests
+#         self.id_ = id_
+
+#         # handle repeats internally. should be able to run K times on exact same input/output pair
+#         # self.repeats = repeats
+        
+#         # list containing the returns from each call of the model on this particular set of arguments.
+#         self.resps = []
+#         # filtered_resps should end up a dict, with a different key for each set of filters to apply. calculate results against each key in filtered_resps
+#         self.filtered_resps = {}
+
+#         #TODO: add more info as needed for detailed logging
+
+#     def __repr__(self):
+#         return f"Req_{self.request_type}{self.args}{self.id_}"
+
+@dataclass
+class LoglikelihoodInstance(Instance):
+    """Instance whose `request_type` defaults to "loglikelihood" (the name of `LM.loglikelihood`)."""
+
+    request_type: str = "loglikelihood"
+
+@dataclass
+class RollingLoglikelihoodInstance(Instance):
+    """Instance whose `request_type` defaults to "loglikelihood_rolling" (the name of `LM.loglikelihood_rolling`)."""
+
+    request_type: str = "loglikelihood_rolling"
+
+@dataclass
+class GenerationInstance(Instance):
+    """Instance whose `request_type` defaults to "greedy_until" (the name of `LM.greedy_until`)."""
+
+    request_type: str = "greedy_until"
--- a/lm_eval/metrics.py
+++ b/lm_eval/metrics.py
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
+import abc
+
+from lm_eval import utils
+
+
+class LM(abc.ABC):
+    """Abstract interface for language models.
+
+    Subclasses must implement `loglikelihood`, `loglikelihood_rolling` and `greedy_until`.
+    """
+
+    def __init__(self):
+        """Defines the interface that should be implemented by all LM subclasses.
+        LMs are assumed to take text (strings) as input and yield strings as output
+        (inputs/outputs should be tokenization-agnostic.)
+
+        """
+
+    @abc.abstractmethod
+    def loglikelihood(self, requests):
+        """Compute log-likelihood of generating a continuation from a context.
+        Downstream tasks should attempt to use loglikelihood instead of other
+        LM calls whenever possible.
+
+        :param requests: list
+            A list of pairs (context, continuation)
+            context: str
+                Context string. Implementations of LM must be able to handle an
+                empty context string.
+            continuation: str
+                The continuation over which log likelihood will be calculated. If
+                there is a word boundary, the space should be in the continuation.
+                For example, context="hello" continuation=" world" is correct.
+        :return: list
+            A list of pairs (logprob, isgreedy)
+            logprob: float
+                The log probability of `continuation`
+            isgreedy:
+                Whether `continuation` would be generated by greedy sampling from `context`
+        """
+        pass
+
+    @abc.abstractmethod
+    def loglikelihood_rolling(self, requests):
+        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
+        - We will use the full max context length of the model.
+        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
+        the max context length.
+        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
+          which may simply concatenate multiple documents together.
+        - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
+          multiple chunks, the last input will still be a full-sized context.
+          Example:
+            Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
+            Prefix: EOT
+            Max context length: 4
+            Resulting input/prediction pairs:
+
+                INPUT:  EOT   0   1   2
+                PRED:     0   1   2   3
+
+                INPUT:    3   4   5   6
+                PRED:     4   5   6   7
+
+                INPUT:    5   6   7   8
+                PRED:             8   9
+
+          Observe that:
+            1. Each token is predicted exactly once
+            2. For the last pair, we provide the full context, but only score the last two tokens
+
+        :param requests: list
+            A list of strings
+            string: str
+                String for which we are computing per-token loglikelihood
+        :return: list
+            A list of pairs (logprob, isgreedy)
+            logprob: float
+                The log probability of `continuation`
+            isgreedy:
+                Whether `continuation` would be generated by greedy sampling from `context`
+
+        NOTE(review): the `BaseLM.loglikelihood_rolling` implementation removed from `lm_eval/base.py`
+        in this commit discarded `is_greedy` and returned one summed log-probability per string,
+        not pairs - confirm which return contract implementers should follow.
+        """
+        pass
+
+    # TODO: Add an optional max length
+    @abc.abstractmethod
+    def greedy_until(self, requests):
+        """Generate greedily until a stopping sequence
+
+        :param requests: list
+            A list of pairs (context, until)
+            context: str
+                Context string
+            until: [str]
+                The string sequences to generate until. These string sequences
+                may each span across multiple tokens, or may be part of one token.
+        :return: list
+            A list of strings continuation
+            continuation: str
+                The generated continuation.
+        """
+        pass
+
+    @classmethod
+    def create_from_arg_string(cls, arg_string, additional_config=None):
+        """Construct an instance of `cls` from a string of constructor arguments.
+
+        :param arg_string:
+            Argument string, parsed into keyword arguments by `utils.simple_parse_args_string`.
+        :param additional_config: dict, optional
+            Extra keyword arguments; entries whose value is None are dropped.
+        :return:
+            `cls(**parsed_args, **additional_args)`. A key present in both raises TypeError.
+        """
+        additional_config = {} if additional_config is None else additional_config
+        args = utils.simple_parse_args_string(arg_string)
+        args2 = {k: v for k, v in additional_config.items() if v is not None}
+        return cls(**args, **args2)
--- a/lm_eval/api/samplers.py
+++ b/lm_eval/api/samplers.py
+
+
+
+class Sampler: # TODO: make this abstract class?
+    """
+    Draws few-shot examples for a task and renders them into a context string.
+    Subclasses change the sampling strategy by overriding `sample`.
+    """
+
+    def __init__(self, docs, task, fewshot_indices=None, rnd=None):
+        # A random source is mandatory; it is what `sample` draws with.
+        self.rnd = rnd
+        assert self.rnd, "must pass rnd to FewShotSampler!"
+
+        self.task = task
+        self.config = task._config
+        self.delimiter = self.config.delimiter
+
+        # `docs` is the HF dataset split provided by task._fewshot_docs(); when indices are
+        # given, restrict the pool of few-shot candidates to exactly those rows.
+        self.docs = docs.select(fewshot_indices) if fewshot_indices else docs
+
+    def get_context(self, doc, num_fewshot):
+        """
+        Return the few-shot context for `doc`: up to `num_fewshot` rendered examples,
+        joined by the delimiter and followed by one trailing delimiter.
+        Only the few-shot part is returned; the caller appends `doc` itself.
+        """
+        cfg = self.config
+
+        # Draw one extra sample if the few-shot split is the split being evaluated, so that
+        # dropping `doc` from the draw still leaves `num_fewshot` examples.
+        surplus = 1 if cfg.fewshot_split == cfg.test_split else 0
+        candidates = self.sample(num_fewshot + surplus)
+
+        # Get rid of the doc we are evaluating, if it was drawn, then trim to the requested size.
+        kept = [candidate for candidate in candidates if candidate != doc][:num_fewshot]
+
+        rendered = [
+            self.task.doc_to_text(shot) + self.task.doc_to_target(shot)
+            for shot in kept
+        ]
+        return self.delimiter.join(rendered) + self.delimiter
+
+    def sample(self, n):
+        """
+        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
+        """
+        return self.rnd.sample(self.docs, n)
+
+
+class BalancedSampler(Sampler):
+    """Placeholder for a sampler that draws approximately class-balanced few-shot examples."""
+
+    def sample(self, n):
+        """
+        TODO: this should return approximately class-balanced samples from our fewshot examples.
+        TODO: what order should they be in?
+        """
+        # Fail loudly: returning None would make the inherited get_context() fail later with
+        # an opaque TypeError when it tries to iterate over the samples.
+        raise NotImplementedError("BalancedSampler.sample is not implemented yet")
+
+class ManualSampler(Sampler):
+    """Placeholder sampler; its sampling strategy has not been written yet."""
+
+    def sample(self, n):
+        """
+        TODO: not implemented yet.
+        """
+        # Fail loudly: returning None would make the inherited get_context() fail later with
+        # an opaque TypeError when it tries to iterate over the samples.
+        raise NotImplementedError("ManualSampler.sample is not implemented yet")
+
+
+# TODO: how should we design this? It might be better to have a single sampler and pass more kwargs at init.
+# It depends on which is easier for a new user to extend with their own functionality.
+
+# types of sampler:
+# - class-balanced, randomly shuffled
+# - class-balanced, one particular set of fewshot examples for all evaled instances
+# - hand-specify number of fewshot examples per class?
+# - random, varies per example (check that this is curr. default in old repo)
+# - random, unified per example
+# - enforce a specific fixed fewshot string! (or should we not use this, in favor of including it in prompt template directly)
+
+
+# - user-specified doc indices to restrict fewshot doc options to
+# - user specifies split to use for drawing fewshot instances (TODO: manually prevent this from being same split you eval!)
+# - user specifies a prepended "description"/string to add in front of the (prompted) input
+
+# - user specifies a location to draw fewshot samples from? DO THIS IN TASK CLASS
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
 import abc
-from typing import Iterable
-import numpy as np
-import random
+from dataclasses import dataclass
+
 import re
-import os
-import json
-import hashlib
+import evaluate
+import random
+import itertools
+
 import datasets
-from sqlitedict import SqliteDict
-from tqdm import tqdm
-import torch
-import torch.nn.functional as F
+import numpy as np

-from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
+from lm_eval.api import METRIC_REGISTRY, AGGREGATION_REGISTRY
+from lm_eval.api.instance import LoglikelihoodInstance, RollingLoglikelihoodInstance, GenerationInstance
+from lm_eval.api.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
 from lm_eval import utils
-from abc import abstractmethod
-
-
-class LM(abc.ABC):
-    def __init__(self):
-        self.cache_hook = CacheHook(None)
-
-    @abstractmethod
-    def loglikelihood(self, requests):
-        """Compute log-likelihood of generating a continuation from a context.
-        Downstream tasks should attempt to use loglikelihood instead of other
-        LM calls whenever possible.
-
-        :param requests: list
-            A list of pairs (context, continuation)
-            context: str
-                Context string. Implementations of LM must be able to handle an
-                empty context string.
-            continuation: str
-                The continuation over which log likelihood will be calculated. If
-                there is a word boundary, the space should be in the continuation.
-                For example, context="hello" continuation=" world" is correct.
-        :return: list
-            A list of pairs (logprob, isgreedy)
-            logprob: float
-                The log probability of `continuation`
-            isgreedy:
-                Whether `continuation` would be generated by greedy sampling from `context`
-        """
-        pass
-
-    @abstractmethod
-    def loglikelihood_rolling(self, requests):
-        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
-        - We will use the full max context length of the model.
-        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
-        the max context length.
-        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
-          which may simply concatenate multiple documents together.
-        - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
-          multiple chunks, the last input will still a full-sized context.
-          Example:
-            Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
-            Prefix: EOT
-            Max context length: 4
-            Resulting input/prediction pairs:
-
-                INPUT:  EOT   0   1   2
-                PRED:     0   1   2   3
-
-                INPUT:    3   4   5   6
-                PRED:     4   5   6   7
-
-                INPUT:    5   6   7   8
-                PRED:             8   9
-
-          Observe that:
-            1. Each token is predicted exactly once
-            2. For the last pair, we provide the full context, but only score the last two tokens
-
-        :param requests: list
-            A list of strings
-            string: str
-                String for which we are computing per-toke  loglikelihood
-        :return: list
-            A list of pairs (logprob, isgreedy)
-            logprob: float
-                The log probability of `continuation`
-            isgreedy:
-                Whether `continuation` would be generated by greedy sampling from `context`
-        """
-        pass
-
-    # TODO: Add an optional max length
-    @abstractmethod
-    def greedy_until(self, requests):
-        """Generate greedily until a stopping sequence
-
-        :param requests: list
-            A list of pairs (context, until)
-            context: str
-                Context string
-            until: [str]
-                The string sequences to generate until. These string sequences
-                may each span across multiple tokens, or may be part of one token.
-        :return: list
-            A list of strings continuation
-            continuation: str
-                The generated continuation.
-        """
-        pass

-    @classmethod
-    def create_from_arg_string(cls, arg_string, additional_config=None):
-        additional_config = {} if additional_config is None else additional_config
-        args = utils.simple_parse_args_string(arg_string)
-        args2 = {k: v for k, v in additional_config.items() if v is not None}
-        return cls(**args, **args2)
-
-    def set_cache_hook(self, cache_hook):
-        self.cache_hook = cache_hook
-
-
-class BaseLM(LM):
-    @property
-    @abstractmethod
-    def eot_token_id(self):
-        pass
-
-    @property
-    @abstractmethod
-    def max_length(self):
-        pass
-
-    @property
-    @abstractmethod
-    def max_gen_toks(self):
-        pass
-
-    @property
-    @abstractmethod
-    def batch_size(self):
-        pass
-
-    @property
-    @abstractmethod
-    def device(self):
-        pass
-
-    @abstractmethod
-    def tok_encode(self, string: str):
-        pass
-
-    @abstractmethod
-    def tok_decode(self, tokens: Iterable[int]):
-        pass
-
-    @abstractmethod
-    def _model_generate(self, context, max_length, eos_token_id):
-        pass
-
-    @abstractmethod
-    def _model_call(self, inps):
-        """
-        inps: a torch tensor of shape [batch, sequence]
-        the size of sequence may vary from call to call
-
-        returns: a torch tensor of shape [batch, sequence, vocab] with the
-        logits returned from the model
-        """
-        pass
-
-    # subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
-    # TODO: enforce this somehow
-
-    def loglikelihood(self, requests):
-        new_reqs = []
-        for context, continuation in requests:
-            if context == "":
-                # end of text as context
-                context_enc = [self.eot_token_id]
-            else:
-                context_enc = self.tok_encode(context)
-
-            continuation_enc = self.tok_encode(continuation)
-
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-
-        return self._loglikelihood_tokens(new_reqs)
-
-    def loglikelihood_rolling(self, requests):
-        # TODO: Implement caching once we've confirmed the perplexity implementation
-        # TODO: automatic batch size detection for vectorization
-
-        loglikelihoods = []
-        for (string,) in tqdm(requests):
-            rolling_token_windows = list(
-                map(
-                    utils.make_disjoint_window,
-                    utils.get_rolling_token_windows(
-                        token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
-                        max_seq_len=self.max_length,
-                        context_len=1,
-                    ),
-                )
-            )
-
-            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-
-            # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
-            # that
-            string_nll = self._loglikelihood_tokens(
-                rolling_token_windows, disable_tqdm=True
-            )
-
-            # discard is_greedy
-            string_nll = [x[0] for x in string_nll]
-
-            string_nll = sum(string_nll)
-            loglikelihoods.append(string_nll)
-
-        return loglikelihoods
-
-    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
-        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
-        res = []
-
-        def _collate(x):
-            # the negative sign on len(toks) sorts descending - this has a few advantages:
-            # - time estimates will always be over not underestimates, which is more useful for planning
-            # - to know the size of a batch when going through the list, you know the first one is always the batch
-            #   padded context length. this is useful to simplify the batching logic and more importantly to make
-            #   automatic adaptive batches much much easier to implement
-            # - any OOMs will happen right away rather than near the end
-
-            toks = x[1] + x[2]
-            return -len(toks), tuple(toks)
-
-        # TODO: automatic (variable) batch size detection for vectorization
-        re_ord = utils.Reorderer(requests, _collate)
-        for chunk in utils.chunks(
-            tqdm(re_ord.get_reordered(), disable=disable_tqdm), self.batch_size
-        ):
-            inps = []
-            cont_toks_list = []
-            inplens = []
-
-            padding_length = None
-
-            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
-            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
-            # again because vectorizing is annoying
-
-            for _, context_enc, continuation_enc in chunk:
-                # sanity check
-                assert len(context_enc) > 0
-                assert len(continuation_enc) > 0
-                assert len(continuation_enc) <= self.max_length
-
-                # how this all works:
-                #          CTX      CONT
-                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
-                # gpt2    \               \
-                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
-                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
-
-                # when too long to fit in context, truncate from the left
-                inp = torch.tensor(
-                    (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
-                    dtype=torch.long,
-                ).to(self.device)
-                (inplen,) = inp.shape
-
-                cont = continuation_enc
-
-                # since in _collate we make sure length is descending, the longest is always the first one.
-                padding_length = (
-                    padding_length if padding_length is not None else inplen
-                )
-
-                # pad length from seq to padding_length
-                inp = torch.cat(
-                    [
-                        inp,  # [seq]
-                        torch.zeros(padding_length - inplen, dtype=torch.long).to(
-                            inp.device
-                        ),  # [padding_length - seq]
-                    ],
-                    dim=0,
-                )
-
-                inps.append(inp.unsqueeze(0))  # [1, padding_length]
-                cont_toks_list.append(cont)
-                inplens.append(inplen)
-
-            batched_inps = torch.cat(inps, dim=0)  # [batch, padding_length
-            multi_logits = F.log_softmax(
-                self._model_call(batched_inps), dim=-1
-            ).cpu()  # [batch, padding_length, vocab]
-
-            for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(
-                chunk, multi_logits, inps, inplens, cont_toks_list
-            ):
-
-                # Slice to original seq length
-                contlen = len(cont_toks)
-                logits = logits[inplen - contlen : inplen].unsqueeze(
-                    0
-                )  # [1, seq, vocab]
-
-                # Check if per-token argmax is exactly equal to continuation
-                greedy_tokens = logits.argmax(dim=-1)
-                cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
-                    0
-                )  # [1, seq]
-                max_equal = (greedy_tokens == cont_toks).all()
-
-                # Obtain log-probs at the corresponding continuation token indices
-                # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
-                logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-                    -1
-                )  # [1, seq]
-
-                # Answer: (log prob, is-exact-match)
-                answer = (float(logits.sum()), bool(max_equal))
-
-                # partial caching
-                if cache_key is not None:
-                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
-
-                res.append(answer)
-
-        return re_ord.get_original(res)
-
-    def greedy_until(self, requests):
-        # TODO: implement fully general `until` that handles until that are
-        #       multiple tokens or that span multiple tokens correctly
-
-        # TODO: extract to TokenizedLM?
-        res = []
-
-        def _collate(x):
-            toks = self.tok_encode(x[0])
-            return len(toks), x[0]
-
-        re_ord = utils.Reorderer(requests, _collate)
-
-        for context, until in tqdm(re_ord.get_reordered()):
-            if isinstance(until, str):
-                until = [until]
-
-            (primary_until,) = self.tok_encode(until[0])
-
-            context_enc = torch.tensor(
-                [self.tok_encode(context)[self.max_gen_toks - self.max_length :]]
-            ).to(self.device)
-
-            cont = self._model_generate(
-                context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until
-            )
-
-            s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :])
-
-            for term in until:
-                s = s.split(term)[0]
-
-            # partial caching
-            self.cache_hook.add_partial("greedy_until", (context, until), s)
-
-            res.append(s)
-
-        return re_ord.get_original(res)
+from lm_eval.filters import build_filter_ensemble
+from lm_eval.api import samplers
+
+
+@dataclass
+class TaskConfig(dict):
+    """
+    Settings for a configurable task (dataset location, splits, prompt templates, metrics, ...).
+
+    Values live in dataclass attributes, not in the inherited dict storage, which stays empty.
+    `config[key]` and `config.get(key)` are therefore overridden to read the attributes.
+    """
+
+    task_name: str = None
+    dataset_path: str = None
+    dataset_name: str = None
+    training_split: str = None
+    validation_split: str = None
+    test_split: str = None
+    fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
+
+    # TODO: add this as more jinja2 appended to start of jinja2 templates. Should allow users to set vars
+    # s.t. they can define e.g. {% set question = query %} to map dataset columns to "canonical" names in prompts.
+    template_vars: str = None
+    doc_to_text: str = None
+    doc_to_target: str = None
+
+    # aggregation: dict = None # TODO: remove, I think these 2 are obsolete w/ current metric_list impl.
+    # higher_is_better: dict = None
+    num_fewshot: int = 0
+    batch_size: int = 1
+    # The example YAML config supplies `metric_list` and `filters` as nested lists, not strings.
+    metric_list: list = None
+    gold_alias: str = None
+    output_type: str = "greedy_until"
+    delimiter: str = "\n\n"
+    filters: list = None
+    normalization: str = None # TODO: add length-normalization of various types, mutual info
+    stop_sequences: list = None # TODO: allow passing of stop sequences to greedy gen.
+
+    def __getitem__(self, item):
+        """Return the attribute named `item`; an unknown name raises AttributeError, not KeyError."""
+        return getattr(self, item)
+
+    def get(self, key, default=None):
+        """
+        Return the config field `key`, or `default` when the field is unknown or unset (None).
+
+        The inherited `dict.get` looks at the empty dict storage and so always returned `default`,
+        which made `Task.__init__` ignore any configured `filters`.
+        """
+        # Only declared fields count as keys, so inherited dict methods are never returned.
+        if key not in self.__dataclass_fields__:
+            return default
+        value = getattr(self, key)
+        return default if value is None else value


 class Task(abc.ABC):
@@ -379,6 +61,7 @@ class Task(abc.ABC):
        {"question": ..., question, answer)
    """

+    VERSION = None
    # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
    # or a path to a custom `datasets` loading script.
    DATASET_PATH: str = None
@@ -386,7 +69,14 @@ class Task(abc.ABC):
    # The name of a subset within `DATASET_PATH`.
    DATASET_NAME: str = None

-    def __init__(self, data_dir=None, cache_dir=None, download_mode=None):
+    OUTPUT_TYPE: str = None
+    def __init__(
+        self,
+        data_dir=None,
+        cache_dir=None,
+        download_mode=None,
+        config=None,
+    ):
        """
        :param data_dir: str
            Stores the path to a local folder containing the `Task`'s data files.
@@ -412,6 +102,17 @@ class Task(abc.ABC):
        self.download(data_dir, cache_dir, download_mode)
        self._training_docs = None
        self._fewshot_docs = None
+        self._instances = None
+
+        self._config = TaskConfig(**config) if config else {}
+
+        if not hasattr(self, "_filters"):
+            self._filters = []
+            for name, components in self._config.get("filters", [["none", ["take_first"]]]):
+                filter_pipeline = build_filter_ensemble(name, components)
+                self._filters.append(filter_pipeline)
+
+        self.sampler = samplers.Sampler(self.training_docs(), self, rnd=random.Random()) # TODO: pass the correct docs in here

    def download(self, data_dir=None, cache_dir=None, download_mode=None):
        """Downloads and returns the task dataset.
@@ -446,21 +147,17 @@ class Task(abc.ABC):
            download_mode=download_mode,
        )

-    def should_decontaminate(self):
-        """Whether this task supports decontamination against model training set."""
-        return False
-
-    @abstractmethod
+    @abc.abstractmethod
    def has_training_docs(self):
        """Whether the task has a training set"""
        pass

-    @abstractmethod
+    @abc.abstractmethod
    def has_validation_docs(self):
        """Whether the task has a validation set"""
        pass

-    @abstractmethod
+    @abc.abstractmethod
    def has_test_docs(self):
        """Whether the task has a test set"""
        pass
@@ -497,6 +194,13 @@ class Task(abc.ABC):
        """
        return doc

+    @property
+    def instances(self):
+        """After calling `task.build_all_requests()`, tasks
+        maintain a list of the dataset instances which will be evaluated.
+
+        Returns None until `build_all_requests()` has been called.
+        """
+        return self._instances
+
    def fewshot_examples(self, k, rnd):
        if self._training_docs is None:
            self._training_docs = list(self.training_docs())
@@ -509,16 +213,46 @@ class Task(abc.ABC):
        )
        assert False

-    @abstractmethod
+    @abc.abstractmethod
    def doc_to_text(self, doc):
        pass

-    @abstractmethod
+    @abc.abstractmethod
    def doc_to_target(self, doc):
        pass

-    @abstractmethod
-    def construct_requests(self, doc, ctx):
+    def build_all_requests(self, limit=None):
+        """Build the Instances for this task and store them in `self._instances`.
+
+        Docs come from the test split if present, otherwise the validation split.
+        :param limit: int or None
+            If truthy, only the first `limit` docs are used; otherwise all docs.
+        """
+        if self.has_test_docs():
+            docs = self.test_docs()
+        elif self.has_validation_docs():
+            docs = self.validation_docs()
+        else:
+            assert (
+                False
+            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+
+        instances = []
+        for doc_id, doc in enumerate(itertools.islice(docs, 0, limit) if limit else docs):
+            # sample fewshot context; note a fresh, unseeded RNG is created per doc
+            fewshot_ctx = self.fewshot_context(
+                doc, self._config.num_fewshot, rnd=random.Random()
+            )
+            # TODO: hardcoded for now: # of runs on each input to be 2. # TODO: we should override this if doing greedy gen so users don't waste time+compute
+            inst = self.construct_requests(doc=doc, ctx=fewshot_ctx, metadata=(self._config["task_name"], doc_id, 2))
+            # construct_requests may return a single Instance or a list of them
+            instances.extend(inst if isinstance(inst, list) else [inst])
+
+        self._instances = instances
+        assert len(self._instances) != 0, "task.build_all_requests() did not find any docs!"
+
+    @abc.abstractmethod
+    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

@@ -528,10 +262,17 @@ class Task(abc.ABC):
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
+        :param doc_idx: int
+            The index of a document within `self.test_docs()` or `self.validation_docs()`,
+            whichever is the main split used.
+        :param repeats: int
+        TODO: update this docstring
+            The number of times each instance in a dataset is inferred on. Defaults to 1, 
+            can be increased for techniques like majority voting.
        """
        pass

-    @abstractmethod
+    @abc.abstractmethod
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
@@ -544,7 +285,7 @@ class Task(abc.ABC):
        """
        pass

-    @abstractmethod
+    @abc.abstractmethod
    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
@@ -553,7 +294,7 @@ class Task(abc.ABC):
        """
        pass

-    @abstractmethod
+    @abc.abstractmethod
    def higher_is_better(self):
        """
        :returns: {str: bool}
@@ -562,20 +303,8 @@ class Task(abc.ABC):
        """
        pass

-    def fewshot_description(self):
-        import warnings
-
-        warnings.warn(
-            "`fewshot_description` will be removed in futures versions. Pass "
-            "any custom descriptions to the `evaluate` function instead.",
-            DeprecationWarning,
-        )
-        return ""
-
    @utils.positional_deprecated
-    def fewshot_context(
-        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
-    ):
+    def fewshot_context(self, doc, num_fewshot, rnd=None):
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -583,35 +312,22 @@ class Task(abc.ABC):
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
-        :param provide_description: bool
-            Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
        :param rnd: random.Random
            The pseudo-random number generator used to randomly sample examples.
            WARNING: This is currently a required arg although it's optionalized with a default `None`.
-        :param description: str
-            The task's description that will be prepended to the fewshot examples.
        :returns: str
            The fewshot context.
        """
        assert (
            rnd is not None
        ), "A `random.Random` generator argument must be provided to `rnd`"
-        assert not provide_description, (
-            "The `provide_description` arg will be removed in future versions. To prepend "
-            "a custom description to the context, supply the corresponding string via the "
-            "`description` arg."
-        )
-        if provide_description is not None:
-            # nudge people to not specify it at all
-            print(
-                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
-            )
-
-        description = description + "\n\n" if description else ""

        if num_fewshot == 0:
            labeled_examples = ""
        else:
+
+            # labeled_examples = self.sampler.get_context(doc, self._config.num_fewshot)
+
            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
            if self.has_training_docs():
                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
@@ -639,21 +355,164 @@ class Task(abc.ABC):
            )

        example = self.doc_to_text(doc)
-        return description + labeled_examples + example
+        return labeled_examples + example
+
+    def apply_filters(self):
+        """Call `apply(self._instances)` on every filter pipeline in `self._filters`, in order."""
+        for f in self._filters:
+            f.apply(self._instances)
+
+
+class ConfigurableTask(Task):
+
+    VERSION = "2.0"
+    OUTPUT_TYPE = "greedy_until"
+
+    def __init__(
+        self, data_dir=None, cache_dir=None, download_mode=None, config: dict = None
+    ):
+        """Set up the task from `config`: dataset location, metrics, data, filters.
+
+        :param config: dict of `TaskConfig` keyword arguments (required; None is not handled).
+        """
+        self._config = TaskConfig(**config)
+        if self._config.dataset_path is not None:
+            self.DATASET_PATH = self._config.dataset_path
+        if self._config.dataset_name is not None:
+            self.DATASET_NAME = self._config.dataset_name
+
+        if self._config.metric_list is not None:
+            self._metric_list = {}
+            self._aggregation_list = {}
+            self._higher_is_better = {}
+            for (metric_name, aggregation, higher_is_better) in self._config.metric_list:
+                self._aggregation_list[metric_name] = AGGREGATION_REGISTRY[aggregation]
+                self._higher_is_better[metric_name] = higher_is_better
+                if metric_name in METRIC_REGISTRY:
+                    self._metric_list[metric_name] = METRIC_REGISTRY[metric_name]
+                    continue
+                # not in METRIC_REGISTRY: try loading it through the `evaluate` library
+                try:
+                    self._metric_list[metric_name] = evaluate.load(metric_name)
+                except Exception as ex:
+                    raise Warning(
+                        "{} not found in the evaluate library!".format(metric_name),
+                        "Please check https://huggingface.co/evaluate-metric",
+                    ) from ex
+
+        self.download(data_dir, cache_dir, download_mode)
+        self._training_docs = None
+        self._fewshot_docs = None
+        self._instances = None  # read by the `instances` property before any requests are built
+
+        self._filters = [
+            build_filter_ensemble(name, components)
+            for name, components in self._config.get("filters", [["none", ["take_first"]]])
+        ]
+
+    def has_training_docs(self):
+        """Whether the task has a training set."""
+        # The task has this split exactly when the config names it.
+        split = self._config.training_split
+        return split is not None
+
+    def has_validation_docs(self):
+        """Whether the task has a validation set."""
+        # The task has this split exactly when the config names it.
+        split = self._config.validation_split
+        return split is not None
+
+    def has_test_docs(self):
+        """Whether the task has a test set."""
+        # The task has this split exactly when the config names it.
+        split = self._config.test_split
+        return split is not None
+
+    def training_docs(self):
+        split = self._config.training_split  # None when the config names no training split
+        return None if split is None else self.dataset[split]  # no split -> None, as before
+
+    def validation_docs(self):
+        split = self._config.validation_split  # None when the config names no validation split
+        return None if split is None else self.dataset[split]  # no split -> None, as before
+
+    def test_docs(self):
+        split = self._config.test_split  # None when the config names no test split
+        return None if split is None else self.dataset[split]  # no split -> None, as before
+
+    def _process_doc(self, doc):
+        """
+        Hook for processing (detokenizing, stripping, replacing, etc.) individual
+        documents; it can be mapped over the documents of a data split,
+        e.g. `map(self._process_doc, self.dataset["validation"])`.
+
+        :return: dict
+            The processed version of `doc`; this implementation returns it unchanged.
+        """
+        return doc
+
+    def doc_to_text(self, doc):  # applies the config's `doc_to_text` template to `doc`
+        return utils.apply_template(self._config.doc_to_text, doc)
+
+    def doc_to_target(self, doc):  # applies the config's `doc_to_target` template to `doc`
+        return utils.apply_template(self._config.doc_to_target, doc)
+
+    def construct_requests(self, doc, ctx, **kwargs):
+        """Return a GenerationInstance for `doc` if OUTPUT_TYPE is "greedy_until", else None (implicitly)."""
+        if self.OUTPUT_TYPE == "greedy_until":
+            return GenerationInstance(doc=doc, arguments=(ctx, "\n\n"), id_=0, **kwargs)
+
+    def process_results(self, doc, results):
+        """Score `results` against the gold target of `doc`, returning a {metric_name: value} dict."""
+        if self._config.gold_alias is not None:
+            gold = doc[self._config.gold_alias]
+        else:
+            gold = self.doc_to_target(doc)
+
+        result_dict = {}
+        for key, result in zip(self._metric_list.keys(), results):
+            _dict = self._metric_list[key].compute(
+                references=[gold],
+                predictions=[result],
+            )
+            # NOTE(review): metrics are paired with results positionally (zip) and must expose `.compute()` returning a dict keyed by the metric name — confirm this holds for METRIC_REGISTRY entries
+            result_dict[key] = _dict[key]
+
+        return result_dict
+
+    def aggregation(self):
+        """Return the {metric_name: aggregation} dict that `__init__` fills from AGGREGATION_REGISTRY."""
+        return self._aggregation_list
+
+    def higher_is_better(self):
+        """Return the {metric_name: bool} dict that `__init__` stores in `self._higher_is_better`."""
+        return self._higher_is_better


 class MultipleChoiceTask(Task):
+
+    OUTPUT_TYPE: str = "loglikelihood"
+
    def doc_to_target(self, doc):
        return " " + doc["choices"][doc["gold"]]

-    def construct_requests(self, doc, ctx):
-        lls = [
-            rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
-        ]
+    def construct_requests(self, doc, ctx, **kwargs):
+        
+        return [LoglikelihoodInstance(
+                doc=doc, 
+                arguments=(ctx, " {}".format(choice)),
+                id_=i,
+                **kwargs,
+            )
+            for i, choice in enumerate(doc["choices"])]
+        #lls = [
+        #    rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
+        # ]

-        return lls
+        # return lls

    def process_results(self, doc, results):
+        results = [res[0] for res in results] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere? 
        gold = doc["gold"]

        acc = 1.0 if np.argmax(results) == gold else 0.0
@@ -679,9 +538,8 @@ class MultipleChoiceTask(Task):


 class PerplexityTask(Task, abc.ABC):
-    def should_decontaminate(self):
-        """Whether this task supports decontamination against model training set."""
-        return True
+
+    OUTPUT_TYPE = "loglikelihood_rolling"

    def has_training_docs(self):
        return False
@@ -691,7 +549,7 @@ class PerplexityTask(Task, abc.ABC):
        return []

    def fewshot_context(
-        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+        self, doc, num_fewshot, rnd=None
    ):
        assert (
            num_fewshot == 0
@@ -699,16 +557,6 @@ class PerplexityTask(Task, abc.ABC):
        assert (
            rnd is not None
        ), "A `random.Random` generator argument must be provided to `rnd`."
-        assert not provide_description, (
-            "The `provide_description` arg will be removed in future versions. To prepend "
-            "a custom description to the context, supply the corresponding string via the "
-            "`description` arg."
-        )
-        if provide_description is not None:
-            # nudge people to not specify it at all
-            print(
-                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
-            )

        return ""

@@ -728,10 +576,12 @@ class PerplexityTask(Task, abc.ABC):
    def doc_to_target(self, doc):
        return doc

-    def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx, **kwargs):
        assert not ctx
-        req = rf.loglikelihood_rolling(self.doc_to_target(doc))
-        return req
+
+        return RollingLoglikelihoodInstance(doc=doc, arguments=(self.doc_to_target(doc),), id_=0, **kwargs)
+        # req = rf.loglikelihood_rolling(self.doc_to_target(doc))
+        # return req

    def process_results(self, doc, results):
        (loglikelihood,) = results
@@ -758,134 +608,3 @@ class PerplexityTask(Task, abc.ABC):
    def count_words(cls, doc):
        """Downstream tasks with custom word boundaries should override this!"""
        return len(re.split(r"\s+", doc))
-
-
-def hash_args(attr, args):
-    dat = json.dumps([attr] + list(args))
-    return hashlib.sha256(dat.encode("utf-8")).hexdigest()
-
-
-class CacheHook:
-    def __init__(self, cachinglm):
-        if cachinglm is None:
-            self.dbdict = None
-            return
-
-        self.dbdict = cachinglm.dbdict
-
-    def add_partial(self, attr, req, res):
-        if self.dbdict is None:
-            return
-        hsh = hash_args(attr, req)
-        self.dbdict[hsh] = res
-
-
-class CachingLM:
-    def __init__(self, lm, cache_db):
-        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
-
-        :param lm: LM
-            Underlying LM
-        :param cache_db: str
-            Path to cache db
-        """
-        self.lm = lm
-        self.cache_db = cache_db
-        if os.path.dirname(cache_db):
-            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
-        self.dbdict = SqliteDict(cache_db, autocommit=True)
-
-        # add hook to lm
-        lm.set_cache_hook(self.get_cache_hook())
-
-    def __getattr__(self, attr):
-        def fn(requests):
-            res = []
-            remaining_reqs = []
-
-            # figure out which ones are cached and which ones are new
-            for req in requests:
-                hsh = hash_args(attr, req)
-                if hsh in self.dbdict:
-                    ob = self.dbdict[hsh]
-
-                    assert ob is not None
-
-                    res.append(ob)
-                else:
-                    res.append(None)
-                    remaining_reqs.append(req)
-
-            # actually run the LM on the requests that do not have cached results
-            rem_res = getattr(self.lm, attr)(remaining_reqs)
-
-            # stick the new ones back into the list and also cache any of the new ones
-            resptr = 0
-            for req, r in zip(remaining_reqs, rem_res):
-                while res[resptr] is not None:
-                    resptr += 1
-
-                res[resptr] = r
-
-                # caching
-                hsh = hash_args(attr, req)
-                self.dbdict[hsh] = r
-            self.dbdict.commit()
-
-            return res
-
-        return fn
-
-    def get_cache_hook(self):
-        return CacheHook(self)
-
-
-REQUEST_RETURN_LENGTHS = {
-    "loglikelihood": 2,
-    "greedy_until": None,
-    "loglikelihood_rolling": None,
-}
-
-
-class Request:
-    def __init__(self, request_type, args, index=None):
-        if request_type not in REQUEST_RETURN_LENGTHS.keys():
-            raise NotImplementedError(
-                "The request type {} is not implemented!".format(request_type)
-            )
-
-        self.request_type = request_type
-        self.args = args
-        self.index = index
-
-    def __iter__(self):
-        if REQUEST_RETURN_LENGTHS[self.request_type] is None:
-            raise IndexError("This request type does not return multiple arguments!")
-        for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
-            yield Request(self.request_type, self.args, i)
-
-    def __getitem__(self, i):
-        if REQUEST_RETURN_LENGTHS[self.request_type] is None:
-            raise IndexError("This request type does not return multiple arguments!")
-        return Request(self.request_type, self.args, i)
-
-    def __eq__(self, other):
-        return (
-            self.request_type == other.request_type
-            and self.args == other.args
-            and self.index == other.index
-        )
-
-    def __repr__(self):
-        return f"Req_{self.request_type}{self.args}[{self.index}]\n"
-
-
-class RequestFactory:
-    def __getattr__(self, attr):
-        def fn(*args):
-            return Request(attr, args)
-
-        return fn
-
-
-rf = RequestFactory()
--- a/lm_eval/datasets/README.md
+++ b/lm_eval/datasets/README.md
-# datasets
-
-This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not.
-
-__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path.
-
-
-__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed.
--- a/lm_eval/datasets/__init__.py
+++ b/lm_eval/datasets/__init__.py
--- a/lm_eval/datasets/asdiv/__init__.py
+++ b/lm_eval/datasets/asdiv/__init__.py
--- a/lm_eval/datasets/asdiv/asdiv.py
+++ b/lm_eval/datasets/asdiv/asdiv.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""ASDIV dataset."""
-
-
-import os
-import xml.etree.ElementTree as ET
-
-import datasets
-
-
-_CITATION = """\
-@misc{miao2021diverse,
-    title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
-    author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
-    year={2021},
-    eprint={2106.15772},
-    archivePrefix={arXiv},
-    primaryClass={cs.AI}
-}
-"""
-
-_DESCRIPTION = """\
-ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
-patterns and problem types) English math word problem (MWP) corpus for evaluating
-the capability of various MWP solvers. Existing MWP corpora for studying AI progress
-remain limited either in language usage patterns or in problem types. We thus present
-a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
-types taught in elementary school. Each MWP is annotated with its problem type and grade
-level (for indicating the level of difficulty).
-"""
-
-_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
-
-
-class ASDiv(datasets.GeneratorBasedBuilder):
-    """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name="asdiv",
-            version=VERSION,
-            description="A diverse corpus for evaluating and developing english math word problem solvers",
-        )
-    ]
-
-    def _info(self):
-        features = datasets.Features(
-            {
-                "body": datasets.Value("string"),
-                "question": datasets.Value("string"),
-                "solution_type": datasets.Value("string"),
-                "answer": datasets.Value("string"),
-                "formula": datasets.Value("string"),
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = _URLS
-        data_dir = dl_manager.download_and_extract(urls)
-        base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50"
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, base_filepath, "dataset", "ASDiv.xml"
-                    ),
-                    "split": datasets.Split.VALIDATION,
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        tree = ET.parse(filepath)
-        root = tree.getroot()
-        for key, problem in enumerate(root.iter("Problem")):
-            yield key, {
-                "body": problem.find("Body").text,
-                "question": problem.find("Question").text,
-                "solution_type": problem.find("Solution-Type").text,
-                "answer": problem.find("Answer").text,
-                "formula": problem.find("Formula").text,
-            }
--- a/lm_eval/datasets/asdiv/dataset_infos.json
+++ b/lm_eval/datasets/asdiv/dataset_infos.json
-{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n    title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n    author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n    year={2021},\n    eprint={2106.15772},\n    archivePrefix={arXiv},\n    primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}}
--- a/lm_eval/datasets/coqa/__init__.py
+++ b/lm_eval/datasets/coqa/__init__.py
--- a/lm_eval/datasets/coqa/coqa.py
+++ b/lm_eval/datasets/coqa/coqa.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""CoQA dataset.
-
-This `CoQA` adds the "additional_answers" feature that's missing in the original
-datasets version:
-https://github.com/huggingface/datasets/blob/master/datasets/coqa/coqa.py
-"""
-
-
-import json
-
-import datasets
-
-
-_CITATION = """\
-@misc{reddy2018coqa,
-    title={CoQA: A Conversational Question Answering Challenge},
-    author={Siva Reddy and Danqi Chen and Christopher D. Manning},
-    year={2018},
-    eprint={1808.07042},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-CoQA is a large-scale dataset for building Conversational Question Answering
-systems. The goal of the CoQA challenge is to measure the ability of machines to
-understand a text passage and answer a series of interconnected questions that
-appear in a conversation.
-"""
-
-_HOMEPAGE = "https://stanfordnlp.github.io/coqa/"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-_URLS = {
-    "train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json",
-    "validation": "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json",
-}
-
-# `additional_answers` are not available in the train set so we fill them with
-# empty dicts of the same form.
-_EMPTY_ADDITIONAL_ANSWER = {
-    "0": [
-        {
-            "span_start": -1,
-            "span_end": -1,
-            "span_text": "",
-            "input_text": "",
-            "turn_id": -1,
-        }
-    ],
-    "1": [
-        {
-            "span_start": -1,
-            "span_end": -1,
-            "span_text": "",
-            "input_text": "",
-            "turn_id": -1,
-        }
-    ],
-    "2": [
-        {
-            "span_start": -1,
-            "span_end": -1,
-            "span_text": "",
-            "input_text": "",
-            "turn_id": -1,
-        }
-    ],
-}
-
-
-class Coqa(datasets.GeneratorBasedBuilder):
-    """CoQA is a large-scale dataset for building Conversational Question Answering systems."""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name="coqa", version=VERSION, description="The CoQA dataset."
-        ),
-    ]
-
-    def _info(self):
-        features = datasets.Features(
-            {
-                "id": datasets.Value("string"),
-                "source": datasets.Value("string"),
-                "story": datasets.Value("string"),
-                "questions": datasets.features.Sequence(
-                    {
-                        "input_text": datasets.Value("string"),
-                        "turn_id": datasets.Value("int32"),
-                    }
-                ),
-                "answers": datasets.features.Sequence(
-                    {
-                        "span_start": datasets.Value("int32"),
-                        "span_end": datasets.Value("int32"),
-                        "span_text": datasets.Value("string"),
-                        "input_text": datasets.Value("string"),
-                        "turn_id": datasets.Value("int32"),
-                    }
-                ),
-                "additional_answers": {
-                    "0": datasets.features.Sequence(
-                        {
-                            "span_start": datasets.Value("int32"),
-                            "span_end": datasets.Value("int32"),
-                            "span_text": datasets.Value("string"),
-                            "input_text": datasets.Value("string"),
-                            "turn_id": datasets.Value("int32"),
-                        }
-                    ),
-                    "1": datasets.features.Sequence(
-                        {
-                            "span_start": datasets.Value("int32"),
-                            "span_end": datasets.Value("int32"),
-                            "span_text": datasets.Value("string"),
-                            "input_text": datasets.Value("string"),
-                            "turn_id": datasets.Value("int32"),
-                        }
-                    ),
-                    "2": datasets.features.Sequence(
-                        {
-                            "span_start": datasets.Value("int32"),
-                            "span_end": datasets.Value("int32"),
-                            "span_text": datasets.Value("string"),
-                            "input_text": datasets.Value("string"),
-                            "turn_id": datasets.Value("int32"),
-                        }
-                    ),
-                },
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
-        data_dirs = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dirs["train"],
-                    "split": datasets.Split.TRAIN,
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": data_dirs["validation"],
-                    "split": datasets.Split.VALIDATION,
-                },
-            ),
-        ]
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        """Yield ``(key, example)`` pairs read from one JSON data file.
-
-        Args:
-            filepath: Path of the JSON file for this split; its top-level
-                ``"data"`` list holds one entry per row.
-            split: Split being generated. For ``datasets.Split.TRAIN`` the
-                shared ``_EMPTY_ADDITIONAL_ANSWER`` placeholder is used
-                instead of reading ``row["additional_answers"]``.
-
-        Yields:
-            Tuples of ``row["id"]`` and a dict with the keys ``id``,
-            ``story``, ``source``, ``questions``, ``answers`` and
-            ``additional_answers``.
-
-        Raises:
-            KeyError: If a row, question or answer lacks an expected field.
-        """
-        question_fields = ("input_text", "turn_id")
-        answer_fields = (
-            "span_start",
-            "span_end",
-            "span_text",
-            "input_text",
-            "turn_id",
-        )
-
-        def pick(record, fields):
-            # Copy only the listed fields, in the listed order; a missing
-            # field raises KeyError.
-            return {name: record[name] for name in fields}
-
-        # The whole file is parsed up front, so the handle can be closed
-        # before the first example is yielded.
-        with open(filepath, encoding="utf-8") as f:
-            data = json.load(f)
-
-        for row in data["data"]:
-            row_id = row["id"]  # named to avoid shadowing the builtin `id`
-            source = row["source"]
-            story = row["story"]
-            questions = [pick(q, question_fields) for q in row["questions"]]
-            answers = [pick(a, answer_fields) for a in row["answers"]]
-            if split == datasets.Split.TRAIN:
-                additional_answers = _EMPTY_ADDITIONAL_ANSWER
-            else:
-                extra = row["additional_answers"]
-                additional_answers = {
-                    slot: [pick(a, answer_fields) for a in extra[slot]]
-                    for slot in ("0", "1", "2")
-                }
-            yield row_id, {
-                "id": row_id,
-                "story": story,
-                "source": source,
-                "questions": questions,
-                "answers": answers,
-                "additional_answers": additional_answers,
-            }
--- a/lm_eval/datasets/coqa/dataset_infos.json
+++ b/lm_eval/datasets/coqa/dataset_infos.json
-{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n    title={CoQA: A Conversational Question Answering Challenge},\n    author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n    year={2018},\n    eprint={1808.07042},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", 
"id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}}
--- a/lm_eval/datasets/drop/__init__.py
+++ b/lm_eval/datasets/drop/__init__.py
--- a/lm_eval/datasets/drop/dataset_infos.json
+++ b/lm_eval/datasets/drop/dataset_infos.json
-{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n    year={2019},\n    eprint={1903.00161},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, 
"_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}}
--- a/lm_eval/datasets/drop/drop.py
+++ b/lm_eval/datasets/drop/drop.py
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
-# even if there are multiple types of answers for the same question.
-"""DROP dataset."""
-
-
-import json
-import os
-
-import datasets
-
-
-_CITATION = """\
-@misc{dua2019drop,
-    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
-    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
-    year={2019},
-    eprint={1903.00161},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
-this crowdsourced, adversarially-created, 96k question-answering benchmark, a
-system must resolve multiple references in a question, map them onto a paragraph,
-and perform discrete operations over them (such as addition, counting, or sorting).
-"""
-
-# Homepage URL reported through `datasets.DatasetInfo` in `Drop._info`.
-_HOMEPAGE = "https://allenai.org/data/drop"
-
-# TODO: Add the licence for the dataset here if you can find it
-_LICENSE = ""
-
-# Download URL keyed by builder config name; `Drop._split_generators`
-# looks it up with `_URLS[self.config.name]`.
-_URLS = {
-    "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
-}
-
-# Placeholder used as `validated_answers` when a QA pair has no
-# "validated_answers" key. It holds a single entry with the same fields as
-# one validated answer in the `Drop._info` schema, all empty.
-_EMPTY_VALIDATED_ANSWER = [
-    {
-        "number": "",
-        "date": {
-            "day": "",
-            "month": "",
-            "year": "",
-        },
-        "spans": [],
-        "worker_id": "",
-        "hit_id": "",
-    }
-]
-
-
-class Drop(datasets.GeneratorBasedBuilder):
-    """DROP is a QA dataset which tests comprehensive understanding of paragraphs."""
-
-    VERSION = datasets.Version("0.0.1")
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name="drop", version=VERSION, description="The DROP dataset."
-        ),
-    ]
-
-    def _info(self):
-        """Return the ``datasets.DatasetInfo`` with the example schema.
-
-        Every leaf value is a string; ``spans`` is a sequence of strings and
-        ``validated_answers`` is a sequence of answer records.
-        """
-        features = datasets.Features(
-            {
-                "section_id": datasets.Value("string"),
-                "passage": datasets.Value("string"),
-                "question": datasets.Value("string"),
-                "query_id": datasets.Value("string"),
-                "answer": {
-                    "number": datasets.Value("string"),
-                    "date": {
-                        "day": datasets.Value("string"),
-                        "month": datasets.Value("string"),
-                        "year": datasets.Value("string"),
-                    },
-                    "spans": datasets.features.Sequence(datasets.Value("string")),
-                    "worker_id": datasets.Value("string"),
-                    "hit_id": datasets.Value("string"),
-                },
-                "validated_answers": datasets.features.Sequence(
-                    {
-                        "number": datasets.Value("string"),
-                        "date": {
-                            "day": datasets.Value("string"),
-                            "month": datasets.Value("string"),
-                            "year": datasets.Value("string"),
-                        },
-                        "spans": datasets.features.Sequence(datasets.Value("string")),
-                        "worker_id": datasets.Value("string"),
-                        "hit_id": datasets.Value("string"),
-                    }
-                ),
-            }
-        )
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Download and extract the archive, then declare train/validation splits.
-
-        Each ``gen_kwargs`` dict is passed to ``_generate_examples`` as
-        keyword arguments.
-        """
-        urls = _URLS[self.config.name]
-        data_dir = dl_manager.download_and_extract(urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "drop_dataset", "drop_dataset_train.json"
-                    ),
-                    "split": "train",
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, "drop_dataset", "drop_dataset_dev.json"
-                    ),
-                    "split": "validation",
-                },
-            ),
-        ]
-
-    @staticmethod
-    def _normalize_date(date):
-        """Return a day/month/year dict, using "" for any part that is absent."""
-        return {part: date.get(part, "") for part in ("day", "month", "year")}
-
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    def _generate_examples(self, filepath, split):
-        """Yield ``(key, example)`` pairs, one per question-answer pair.
-
-        Args:
-            filepath: Path of the JSON file; it maps each section id to a
-                record with ``"passage"`` and ``"qa_pairs"``.
-            split: Not used here; accepted because ``gen_kwargs`` supplies it.
-
-        Yields:
-            Tuples of a running integer key (starting at 0) and a dict with
-            the keys ``section_id``, ``passage``, ``question``, ``query_id``,
-            ``answer`` and ``validated_answers``. QA pairs without a
-            ``"validated_answers"`` key get ``_EMPTY_VALIDATED_ANSWER``.
-
-        Raises:
-            KeyError: If a required field of a section, QA pair or primary
-                answer is missing. Fields of validated answers fall back to
-                empty values instead.
-        """
-        # The whole file is parsed up front, so the handle can be closed
-        # before the first example is yielded.
-        with open(filepath, encoding="utf-8") as f:
-            data = json.load(f)
-
-        key = 0
-        for section_id, example in data.items():
-            # Each example (passage) has multiple sub-question-answer pairs.
-            for qa in example["qa_pairs"]:
-                # Build answer.
-                raw_answer = qa["answer"]
-                answer = {
-                    "number": raw_answer["number"],
-                    "date": self._normalize_date(raw_answer["date"]),
-                    "spans": raw_answer["spans"],
-                    "worker_id": raw_answer.get("worker_id", ""),
-                    "hit_id": raw_answer.get("hit_id", ""),
-                }
-                if "validated_answers" in qa:
-                    validated_answers = [
-                        {
-                            "number": va.get("number", ""),
-                            # A missing "date" is treated like the other
-                            # optional fields rather than raising KeyError.
-                            "date": self._normalize_date(va.get("date", {})),
-                            # `spans` is declared as a sequence of strings in
-                            # `_info`, so the fallback is an empty list, not "".
-                            "spans": va.get("spans", []),
-                            "worker_id": va.get("worker_id", ""),
-                            "hit_id": va.get("hit_id", ""),
-                        }
-                        for va in qa["validated_answers"]
-                    ]
-                else:
-                    validated_answers = _EMPTY_VALIDATED_ANSWER
-                yield key, {
-                    "section_id": section_id,
-                    "passage": example["passage"],
-                    "question": qa["question"],
-                    "query_id": qa["query_id"],
-                    "answer": answer,
-                    "validated_answers": validated_answers,
-                }
-                key += 1
--- a/lm_eval/datasets/headqa/__init__.py
+++ b/lm_eval/datasets/headqa/__init__.py