"git@developer.sourcefind.cn:change/sglang.git" did not exist on "66283dbc0c052c6f32bde68451addc5b0d00cf3b"
Unverified commit cf074822, authored by Stella Biderman, committed by GitHub

Merge pull request #316 from jon-tow/master

Revert "Merge branch 'master' into master"
parents 5fe7e2c0 7585ec56
@@ -2,38 +2,25 @@ import collections
 import itertools
 import pathlib
 import random
 import lm_eval.metrics
 import lm_eval.models
 import lm_eval.tasks
 import lm_eval.base
-import promptsource
 import numpy as np
-from promptsource.templates import DatasetTemplates
 from lm_eval.utils import positional_deprecated, run_task_tests
 @positional_deprecated
-def simple_evaluate(
-    model,
-    model_args=None,
-    tasks=[],
-    num_fewshot=0,
-    batch_size=None,
-    device=None,
-    no_cache=False,
-    limit=None,
-    bootstrap_iters=100000,
-    description_dict=None,
-    check_integrity=False,
-):
+def simple_evaluate(model, model_args=None, tasks=[],
+                    num_fewshot=0, batch_size=None, device=None,
+                    no_cache=False, limit=None, bootstrap_iters=100000,
+                    description_dict=None, check_integrity=False):
"""Instantiate and evaluate a model on a list of tasks. """Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM] :param model: Union[str, LM]
Name of model or LM object, see lm_eval.models.get_model Name of model or LM object, see lm_eval.models.get_model
:param model_args: Optional[str] :param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string. String arguments for each model class, see LM.create_from_arg_string.
Ignored if `model` argument is a LM object. Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]] :param tasks: list[Union[str, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
...@@ -50,7 +37,7 @@ def simple_evaluate( ...@@ -50,7 +37,7 @@ def simple_evaluate(
:param bootstrap_iters: :param bootstrap_iters:
Number of iterations for bootstrap statistics Number of iterations for bootstrap statistics
:param description_dict: dict[str, str] :param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description` Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool :param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks Whether to run the relevant part of the test suite for the tasks
:return :return
...@@ -62,28 +49,20 @@ def simple_evaluate( ...@@ -62,28 +49,20 @@ def simple_evaluate(
assert tasks != [], "No tasks specified" assert tasks != [], "No tasks specified"
     if isinstance(model, str):
-        if model_args is None:
-            model_args = ""
-        lm = lm_eval.models.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "device": device}
-        )
+        if model_args is None: model_args = ""
+        lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, {
+            'batch_size': batch_size, 'device': device
+        })
     else:
         assert isinstance(model, lm_eval.base.LM)
         lm = model
-    # TODO: Hard-code turning off cache while testing. Remove once testing is completed.
-    no_cache = True
     if not no_cache:
         lm = lm_eval.base.CachingLM(
-            lm,
-            "lm_cache/"
-            + model
-            + "_"
-            + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
-            + ".db",
+            lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
         )
-    task_dict = lm_eval.tasks.get_task_dict_promptsource(tasks)
+    task_dict = lm_eval.tasks.get_task_dict(tasks)
     if check_integrity:
         run_task_tests(task_list=tasks)
@@ -93,7 +72,7 @@ def simple_evaluate(
         task_dict=task_dict,
         num_fewshot=num_fewshot,
         limit=limit,
-        description_dict=description_dict,
+        description_dict=description_dict
     )
     # add info about the model and few shot config
@@ -106,22 +85,14 @@ def simple_evaluate(
         "no_cache": no_cache,
         "limit": limit,
         "bootstrap_iters": bootstrap_iters,
-        "description_dict": description_dict,
+        "description_dict": description_dict
     }
     return results
 @positional_deprecated
-def evaluate(
-    lm,
-    task_dict,
-    provide_description=None,
-    num_fewshot=0,
-    limit=None,
-    bootstrap_iters=100000,
-    description_dict=None,
-):
+def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, bootstrap_iters=100000, description_dict=None):
     """Instantiate and evaluate a model on a list of tasks.
     :param lm: obj
@@ -137,7 +108,7 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param description_dict: dict[str, str]
         Dictionary of custom task descriptions of the form: `task_name: description`
     :return
         Dictionary of results
     """
@@ -147,14 +118,12 @@ def evaluate(
     assert not provide_description  # not implemented.
     if provide_description is not None:
         # nudge people to not specify it at all
-        print(
-            "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
-        )
+        print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
     task_dict_items = [
         (name, task)
         for name, task in task_dict.items()
-        if (task.has_validation_docs() or task.has_test_docs())
+        if(task.has_validation_docs() or task.has_test_docs())
     ]
     results = collections.defaultdict(dict)
@@ -172,8 +141,8 @@ def evaluate(
     docs = {}
     # get lists of each type of request
-    for task_prompt_name, task in task_dict_items:
-        versions[task_prompt_name] = task.VERSION
+    for task_name, task in task_dict_items:
+        versions[task_name] = task.VERSION
         # default to test doc, fall back to val doc if validation unavailable
         # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
         if task.has_test_docs():
@@ -184,39 +153,29 @@ def evaluate(
             raise RuntimeError("Task has neither test_docs nor validation_docs")
         # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
-        task_docs = list(enumerate(list(task_doc_func())))
+        task_docs = list(task_doc_func())
         rnd = random.Random()
         rnd.seed(42)
         rnd.shuffle(task_docs)
-        description = (
-            description_dict[task_prompt_name]
-            if description_dict and task_prompt_name in description_dict
-            else ""
-        )
+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
-        for doc_id, (original_doc_id, doc) in enumerate(
-            itertools.islice(task_docs, 0, limit)
-        ):
-            if task.invalid_doc_for_prompt(doc):
-                continue
-            docs[(task_prompt_name, doc_id)] = doc
-            ctx, fewshotex_logging_info = task.fewshot_context(
-                doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
-            )
-            fewshotex_logging_info["doc_id"] = original_doc_id
-            args = {"num_fewshot": num_fewshot}
-            reqs = task.construct_requests(doc, ctx, args)
+        for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
+            docs[(task_name, doc_id)] = doc
+            ctx = task.fewshot_context(
+                doc=doc,
+                num_fewshot=num_fewshot,
+                rnd=rnd,
+                description=description
+            )
+            reqs = task.construct_requests(doc, ctx)
             if not isinstance(reqs, (list, tuple)):
                 reqs = [reqs]
             for i, req in enumerate(reqs):
                 requests[req.request_type].append(req)
                 # i: index in requests for a single task instance
                 # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.request_type].append(
-                    (i, task_prompt_name, doc, doc_id, fewshotex_logging_info)
-                )
+                requests_origin[req.request_type].append((i, task_name, doc, doc_id))
     # all responses for each (task, doc)
     process_res_queue = collections.defaultdict(list)
@@ -230,82 +189,42 @@ def evaluate(
         print("Running", reqtype, "requests")
         resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [
-            x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
-        ]
-        for resp, (i, task_prompt_name, doc, doc_id, fewshotex_logging_info) in zip(
-            resps, requests_origin[reqtype]
-        ):
-            process_res_queue[(task_prompt_name, doc_id)].append(
-                (i, resp, fewshotex_logging_info)
-            )
+        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
+        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
+            process_res_queue[(task_name, doc_id)].append((i, resp))
     vals = collections.defaultdict(list)
     # unpack results and sort back in order and return control to Task
-    examples = []
-    for (task_prompt_name, doc_id), per_doc_requests in process_res_queue.items():
-        per_doc_requests.sort(key=lambda x: x[0])
-        per_doc_results = [x[1] for x in per_doc_requests]
-        fewshot_logging_info = [x[2] for x in per_doc_requests][0]
-        task = task_dict[task_prompt_name]
-        doc = docs[(task_prompt_name, doc_id)]
-        output = task.process_results(doc, per_doc_results)
-        if task.save_examples:
-            metrics, example = output
-            example.update(fewshot_logging_info)
-            example.update(task.get_logging_info())
-            examples.append(example)
-        else:
-            metrics = output
-            example = fewshot_logging_info
-            example.update(task.get_logging_info())
-            examples.append(example)
-        for metric, value in metrics.items():
-            vals[(task_prompt_name, metric)].append(value)
+    for (task_name, doc_id), requests in process_res_queue.items():
+        requests.sort(key=lambda x: x[0])
+        requests = [x[1] for x in requests]
+        task = task_dict[task_name]
+        doc = docs[(task_name, doc_id)]
+        metrics = task.process_results(doc, requests)
+        for metric, value in metrics.items():
+            vals[(task_name, metric)].append(value)
     # aggregate results
-    metric_results = []
-    for (task_prompt_name, metric), items in vals.items():
-        task_name, prompt_name = task_prompt_name.split("+")
-        results[task_prompt_name]["task_name"] = task_name
-        results[task_prompt_name]["prompt_name"] = prompt_name
-        task = task_dict[task_prompt_name]
-        results[task_prompt_name][metric] = task.aggregation()[metric](items)
-        _metric_results = {
-            "task_name": task_name,
-            "prompt_name": prompt_name,
-            metric: task.aggregation()[metric](items),
-            **task.get_logging_info(),
-        }
+    for (task_name, metric), items in vals.items():
+        task = task_dict[task_name]
+        results[task_name][metric] = task.aggregation()[metric](items)
         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
         # so we run them less iterations. still looking for a cleaner way to do this
         stderr = lm_eval.metrics.stderr_for_metric(
             metric=task.aggregation()[metric],
-            bootstrap_iters=min(bootstrap_iters, 1000)
-            if metric in ["bleu", "chrf", "ter"]
-            else bootstrap_iters,
+            bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
         )
         if stderr is not None:
-            results[task_prompt_name][metric + "_stderr"] = stderr(items)
-            _metric_results[metric + "_stderr"] = stderr(items)
-        metric_results.append(_metric_results)
+            results[task_name][metric + "_stderr"] = stderr(items)
     return {
-        # List of results that tracks the averages per model and prompt.
-        "results": metric_results,
-        "versions": dict(versions),
-        # List of all prompt x doc examples with additional information in it.
-        "examples": examples,
-        # Original results used for generating the table when running this file.
-        "table_results": dict(results),
+        "results": dict(results),
+        "versions": dict(versions)
     }
@@ -315,50 +234,22 @@ def make_table(result_dict):
     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()
-    md_writer.headers = ["Task", "Prompt", "Version", "Metric", "Value", "", "Stderr"]
-    latex_writer.headers = [
-        "Task",
-        "Prompt",
-        "Version",
-        "Metric",
-        "Value",
-        "",
-        "Stderr",
-    ]
+    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
+    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
     values = []
-    for k, dic in result_dict["table_results"].items():
+    for k, dic in result_dict["results"].items():
         version = result_dict["versions"][k]
         for m, v in dic.items():
             if m.endswith("_stderr"):
                 continue
-            if "_name" in m:
-                continue
             if m + "_stderr" in dic:
                 se = dic[m + "_stderr"]
-                values.append(
-                    [
-                        dic["task_name"],
-                        dic["prompt_name"],
-                        version,
-                        m,
-                        "%.4f" % v,
-                        "±",
-                        "%.4f" % se,
-                    ]
-                )
+                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
             else:
-                values.append(
-                    [
-                        dic["task_name"],
-                        dic["prompt_name"],
-                        version,
-                        m,
-                        "%.4f" % v,
-                        "",
-                        "",
-                    ]
-                )
+                values.append([k, version, m, '%.4f' % v, '', ''])
             k = ""
             version = ""
     md_writer.value_matrix = values
...
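For orientation, a hedged sketch of how the restored `simple_evaluate` entry point is typically driven; this is illustrative only and not part of the commit, and the model name, `model_args` string, and task list are assumptions:

# Illustrative only -- model/task choices below are assumptions, not from this diff.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="gpt2",                    # resolved via lm_eval.models.get_model
    model_args="pretrained=gpt2",    # parsed by LM.create_from_arg_string
    tasks=["lambada", "hellaswag"],
    num_fewshot=0,
    no_cache=True,
)
print(evaluator.make_table(results))  # renders the "results"/"versions" dicts as a table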
-import typing
 import math
 from collections.abc import Iterable
 import numpy as np
 import sacrebleu
-from rouge_score import rouge_scorer
 import sklearn.metrics
 import random
@@ -186,74 +184,6 @@ def _sacreformat(refs, preds):
     return refs, preds
def rouge(
refs: typing.List[str],
pred: str,
rouge_types: typing.List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
):
""" ROUGE with multi-reference support
Implementation based on GEM-metrics:
https://github.com/GEM-benchmark/GEM-metrics/blob/431a8174bd6b3637e8d6118bfad2983e39e99733/gem_metrics/rouge.py
:param refs:
A `list` of reference `str`s.
:param pred:
A single prediction `str`.
"""
# Add newlines between sentences to correctly compute `rougeLsum`.
if "rougeLsum" in rouge_types:
# TODO: Adapt this to handle languages that do not support sentence endings by `.`.
# See GEM-metrics implementation with lang specific `nltk` tokenizers to
# split sentences.
pred = pred.replace(".", ".\n")
refs = [ref.replace(".", ".\n") for ref in refs]
scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
# ROUGE multi-ref jackknifing
if len(refs) > 1:
cur_scores = [scorer.score(ref, pred) for ref in refs]
# get best score for all leave-one-out sets
best_scores = []
for leave in range(len(refs)):
cur_scores_leave_one = [
cur_scores[s] for s in range(len(refs)) if s != leave
]
best_scores.append(
{
rouge_type: max(
[s[rouge_type] for s in cur_scores_leave_one],
key=lambda s: s.fmeasure,
)
for rouge_type in rouge_types
}
)
# average the leave-one-out bests to produce the final score
score = {
rouge_type: rouge_scorer.scoring.Score(
np.mean([b[rouge_type].precision for b in best_scores]),
np.mean([b[rouge_type].recall for b in best_scores]),
np.mean([b[rouge_type].fmeasure for b in best_scores]),
)
for rouge_type in rouge_types
}
else:
score = scorer.score(refs[0], pred)
# convert the named tuples to plain nested dicts
score = {
rouge_type: {
"precision": score[rouge_type].precision,
"recall": score[rouge_type].recall,
"fmeasure": score[rouge_type].fmeasure,
}
for rouge_type in rouge_types
}
return score
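A hedged usage sketch of the multi-reference rouge() helper removed above (requires the rouge_score package; the reference and prediction strings are invented examples):

refs = ["The cat sat on the mat.", "A cat was sitting on the mat."]
pred = "The cat is on the mat."
scores = rouge(refs, pred, rouge_types=["rouge1", "rougeL"])
print(scores["rouge1"]["fmeasure"])  # each entry is a dict with precision/recall/fmeasure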
 # stderr stuff
 class _bootstrap_internal:
...
 from . import gpt2
-from . import gptj
 from . import gpt3
-from . import t5
-from . import t0
 from . import dummy
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
     "gpt2": gpt2.GPT2LM,
-    "gptj": gptj.GPTJLM,
     "gpt3": gpt3.GPT3LM,
-    "t5": t5.T5LM,
-    "mt5": t5.T5LM,
-    "t0": t0.T0LM,
-    "dummy": dummy.DummyLM,
+    "dummy": dummy.DummyLM,
 }
...
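A sketch of how the registry above is consumed (get_model is the lookup helper referenced in evaluator.py; the argument string and device are assumed examples):

import lm_eval.models

lm_class = lm_eval.models.get_model("gpt2")   # -> gpt2.GPT2LM from MODEL_REGISTRY
lm = lm_class.create_from_arg_string(
    "pretrained=gpt2", {"batch_size": 1, "device": "cpu"}
)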
@@ -4,16 +4,8 @@ from lm_eval.base import BaseLM
 class HFLM(BaseLM):
-    def __init__(
-        self,
-        device="cuda",
-        pretrained="gpt2",
-        revision="main",
-        subfolder=None,
-        tokenizer=None,
-        batch_size=1,
-        parallelize=False
-    ):
+    def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
         super().__init__()
         assert isinstance(device, str)
@@ -23,61 +15,36 @@ class HFLM(BaseLM):
         if device:
             self._device = torch.device(device)
         else:
-            self._device = (
-                torch.device("cuda")
-                if torch.cuda.is_available()
-                else torch.device("cpu")
-            )
+            self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
         # TODO: update this to be less of a hack once subfolder is fixed in HF
         self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision + ("/" + subfolder if subfolder is not None else ""),
-        )
+            pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
+        ).to(self.device)
         self.gpt2.eval()
         # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            pretrained if tokenizer is None else tokenizer,
-            revision=revision,
-            subfolder=subfolder,
-        )
-        assert isinstance(
-            self.tokenizer,
-            (
-                transformers.GPT2Tokenizer,
-                transformers.GPT2TokenizerFast,
-                transformers.T5Tokenizer,
-                transformers.T5TokenizerFast,
-            ),
-        ), "this tokenizer has not been checked for compatibility yet!"
+            pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
+        assert isinstance(self.tokenizer, (
+            transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
+            transformers.T5Tokenizer, transformers.T5TokenizerFast,
+        )), "this tokenizer has not been checked for compatibility yet!"
         self.vocab_size = self.tokenizer.vocab_size
-        if isinstance(
-            self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
-        ):
-            assert self.tokenizer.encode("hello\n\nhello") == [
-                31373,
-                198,
-                198,
-                31373,
-            ], self.tokenizer.encode("hello\n\nhello")
+        if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
+            assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
+                self.tokenizer.encode('hello\n\nhello')
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
         # TODO: fix multi-gpu
-        if parallelize:
-            self.gpt2.parallelize()
-            self._device = torch.device('cuda:0')
-        else:
-            self.gpt2.to(self._device)
-    @property
-    def eot_token(self):
-        return self.tokenizer.eos_token
+        # gpus = torch.cuda.device_count()
+        # if gpus > 1:
+        #     self.gpt2 = nn.DataParallel(self.gpt2)
     @property
     def eot_token_id(self):
@@ -108,7 +75,7 @@ class HFLM(BaseLM):
     def tok_encode(self, string: str):
         return self.tokenizer.encode(string, add_special_tokens=False)
     def tok_decode(self, tokens):
         return self.tokenizer.decode(tokens)
@@ -122,53 +89,15 @@ class HFLM(BaseLM):
         """
         with torch.no_grad():
             return self.gpt2(inps)[0][:, :, :50257]
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
max_length = max_length + context.size(1)
if num_fewshot == 0:
generations = self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.gpt2.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
# Remove the context from the generations
return generations[0, context.shape[1] :]
# for backwards compatibility # for backwards compatibility
GPT2LM = HFLM GPT2LM = HFLM
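The removed _get_stopping_criteria/_model_generate pair above implements multi-token stop sequences on top of Hugging Face generate(). A self-contained sketch of the same pattern, with the explanatory text as comments; the model name, prompt, and stop string are assumptions:

import torch
import transformers

class StopOnSequence(transformers.StoppingCriteria):
    # Stop once the decoded tail of the running sequence contains the stop string.
    def __init__(self, stop_ids, tokenizer):
        self.stop_text = tokenizer.decode(stop_ids)
        self.window = len(stop_ids) + 1
        self.tokenizer = tokenizer

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        tail = self.tokenizer.decode(input_ids[0, -self.window:])
        return self.stop_text in tail

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
context = tokenizer("Q: What is 2 + 2?\nA:", return_tensors="pt").input_ids
stop_ids = tokenizer.encode("\n\n")
out = model.generate(
    context,
    max_length=context.size(1) + 32,
    stopping_criteria=transformers.StoppingCriteriaList(
        [StopOnSequence(stop_ids, tokenizer)]
    ),
    do_sample=False,
)
print(tokenizer.decode(out[0, context.shape[1]:]))  # continuation only, as in the removed code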
import transformers
import torch
from lm_eval.base import BaseLM
class GPTJLM(BaseLM):
def __init__(
self,
device="cuda",
batch_size=1,
parallelize=False,
):
super().__init__()
assert isinstance(device, str)
assert isinstance(batch_size, int)
if device:
self._device = torch.device(device)
else:
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
pretrained = "EleutherAI/gpt-j-6B"
self.gptj = transformers.AutoModelForCausalLM.from_pretrained(pretrained).to(self.device)
self.gptj.eval()
# pretrained tokenizer for neo is broken for now so just hard-coding this to gptj
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
self.vocab_size = self.tokenizer.vocab_size
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
if parallelize:
self.gptj.parallelize()
self._device = torch.device('cuda:0')
else:
self.gptj.to(self._device)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
try:
return self.gptj.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gptj.config.max_position_embeddings
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.gptj(inps)[0][:, :, :50257]
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
max_length = max_length + context.size(1)
if num_fewshot == 0:
generations = self.gptj.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.gptj.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
# Remove the context from the generations
return generations[0, context.shape[1] :]
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
import math
class T0LM(BaseLM):
# MAX_GEN_TOKS = 256
# MAX_INP_LENGTH = 512
# VOCAB_SIZE = 32100
# EOT_TOKEN_ID = 1
def __init__(self, device='cuda', parallelize=False, pretrained='t0', batch_size=1):
super().__init__()
if device:
self._device = torch.device(device)
else:
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.t0 = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
self.t0.eval()
if parallelize == "True":
self.t0.parallelize()
self._device = torch.device('cuda:0')
else:
self.t0.to(self._device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
# self.max_length = self.MAX_INP_LENGTH
self.batch_size = int(batch_size)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
return self.tokenizer.model_max_length
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self._batch_size # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inputs_tok, targets_tok):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.t0(
**inputs_tok,
labels=targets_tok["input_ids"]
)
def loglikelihood(self, requests):
res = []
for chunk in tqdm(utils.chunks(requests, self.batch_size), total=math.ceil(len(requests)/self.batch_size)):
inputs, targets = zip(*chunk)
# Fill in empty encoder inputs with eos_token
inputs = (
f"{self.eot_token}"
if len(input_) == 0
else input_
for input_ in inputs
)
inputs_tok = self.tokenizer(
list(inputs),
max_length=self.max_length,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in inputs_tok:
inputs_tok[key] = inputs_tok[key][:, -(self.max_length - 1) :]
targets_tok = self.tokenizer(
list(targets),
max_length=self.max_gen_toks,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in targets_tok:
targets_tok[key] = targets_tok[key][:, -(self.max_length - 1) :]
outputs = self._model_call(inputs_tok, targets_tok)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
chunk,
log_softmaxes,
targets_tok["input_ids"],
targets_tok["attention_mask"],
)
for cache_key, log_softmax, target_tok, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tok = target_tok[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tok).all()
target_logits = torch.gather(
log_softmax, 1, target_tok.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return res
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
if num_fewshot == 0:
generations = self.t0.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.t0.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations[0]
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
import math
class T5LM(BaseLM):
# MAX_GEN_TOKS = 256
# MAX_INP_LENGTH = 512
# VOCAB_SIZE = 32128
# EOT_TOKEN_ID = 1
def __init__(
self,
device='cuda',
parallelize=False,
pretrained='t5',
batch_size=1
):
super().__init__()
if device:
self._device = torch.device(device)
else:
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.t5 = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
self.t5.eval()
if parallelize == "True":
self.t5.parallelize()
self._device = torch.device('cuda:0')
else:
self.t5.to(self._device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
# self.max_length = self.MAX_INP_LENGTH
self._batch_size = int(batch_size)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
return self.tokenizer.model_max_length
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self._batch_size # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inputs_tok, targets_tok):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.t5(
**inputs_tok,
labels=targets_tok["input_ids"]
)
def loglikelihood(self, requests):
res = []
for chunk in tqdm(utils.chunks(requests, self.batch_size), total=math.ceil(len(requests)/self.batch_size)):
inputs, targets = zip(*chunk)
# Fill in empty encoder inputs with eos_token
inputs = (
f"{self.eot_token}"
if len(input_) == 0
else input_
for input_ in inputs
)
inputs_tok = self.tokenizer(
list(inputs),
max_length=self.max_length,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in inputs_tok:
inputs_tok[key] = inputs_tok[key][:, -(self.max_length - 1) :]
targets_tok = self.tokenizer(
list(targets),
max_length=self.max_gen_toks,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in targets_tok:
targets_tok[key] = targets_tok[key][:, -(self.max_length - 1) :]
outputs = self._model_call(inputs_tok, targets_tok)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
chunk,
log_softmaxes,
targets_tok["input_ids"],
targets_tok["attention_mask"],
)
for cache_key, log_softmax, target_tok, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tok = target_tok[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tok).all()
target_logits = torch.gather(
log_softmax, 1, target_tok.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return res
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
if num_fewshot == 0:
generations = self.t5.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.t5.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations[0]
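A hedged sketch of driving the removed T5LM wrapper's loglikelihood path: requests are (encoder input, target) string pairs, and each result is a (summed target log-prob, greedy-match flag) tuple. The checkpoint name and prompt are assumptions, and the call downloads t5-small on first use:

lm = T5LM(device="cpu", pretrained="t5-small", batch_size=1)
pairs = [
    ("translate English to German: The house is wonderful.", "Das Haus ist wunderbar."),
]
print(lm.loglikelihood(pairs))  # -> [(summed log-prob of the target, is_greedy flag)]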
"""
A dataset of approximately 200K news headlines from the year 2012 to 2018 collected from HuffPost.
Homepage: https://www.kaggle.com/datasets/rmisra/news-category-dataset
"""
from lm_eval.base import PromptSourceTask
_CITATION = """\
@book{book,
author = {Misra, Rishabh and Grover, Jigyasa},
year = {2021},
month = {01},
pages = {},
title = {Sculpting Data for ML: The first act of Machine Learning},
isbn = {978-0-578-83125-1}
}
@dataset{dataset,
author = {Misra, Rishabh},
year = {2018},
month = {06},
pages = {},
title = {News Category Dataset},
doi = {10.13140/RG.2.2.20331.18729}
}
"""
class HuffPost(PromptSourceTask):
VERSION = 0
DATASET_PATH = "khalidalt/HuffPost"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
-from promptsource.templates import DatasetTemplates
 from pprint import pprint
 from typing import List, Union
 import sacrebleu
 import lm_eval.base
 from . import superglue
 from . import glue
 from . import arc
@@ -54,27 +52,15 @@ from . import blimp
 from . import asdiv
 from . import gsm8k
 from . import storycloze
-from . import hans
-from . import gem_webnlg
-from . import lama
-# from . import e2e_nlg_cleaned
-from . import gem_xsum
-from . import gem_mlsum
-from . import wino_bias
-from . import e2e_nlg_cleaned
-from . import gem_asset_turk
-from . import crows_pairs_multilingual
-from . import lama
-from . import HuffPost
 ########################################
 # Translation tasks
 ########################################
 # 6 total
 gpt3_translation_benchmarks = {
-    "wmt14": ["en-fr", "fr-en"], # French
-    "wmt16": ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian
+    "wmt14": ['en-fr', 'fr-en'], # French
+    "wmt16": ['en-ro', 'ro-en', 'de-en', 'en-de'], # German, Romanian
 }
@@ -82,7 +68,7 @@ gpt3_translation_benchmarks = {
 selected_translation_benchmarks = {
     **gpt3_translation_benchmarks,
     "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
-    "iwslt17": ["en-ar", "ar-en"], # Arabic
+    "iwslt17": ['en-ar', 'ar-en'] # Arabic
 }
 # 319 total
@@ -106,7 +92,7 @@ TASK_REGISTRY = {
     "rte": glue.RTE,
     "qnli": glue.QNLI,
     "qqp": glue.QQP,
-    # "stsb": glue.STSB, # not implemented yet
+    #"stsb": glue.STSB, # not implemented yet
     "sst": glue.SST,
     "wnli": glue.WNLI,
     # SuperGLUE
@@ -117,37 +103,38 @@ TASK_REGISTRY = {
     "record": superglue.ReCoRD,
     "wic": superglue.WordsInContext,
     "wsc": superglue.SGWinogradSchemaChallenge,
     # Order by benchmark/genre?
     "coqa": coqa.CoQA,
     "drop": drop.DROP,
     "lambada": lambada.LAMBADA,
     "lambada_cloze": lambada_cloze.LAMBADA_cloze,
-    **gem_webnlg.construct_tasks(),
     # multilingual lambada
-    **gem_asset_turk.construct_tasks(),
     **lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText, "wikitext": wikitext.WikiText,
# "cbt-cn": cbt.CBTCN, # disabled pending context length fix # "cbt-cn": cbt.CBTCN, # disabled pending context length fix
# "cbt-ne": cbt.CBTNE, # disabled pending context length fix # "cbt-ne": cbt.CBTNE, # disabled pending context length fix
"piqa": piqa.PiQA, "piqa": piqa.PiQA,
"prost": prost.PROST, "prost": prost.PROST,
"mc_taco": mc_taco.MCTACO, "mc_taco": mc_taco.MCTACO,
# Science related # Science related
"pubmedqa": pubmedqa.Pubmed_QA, "pubmedqa" : pubmedqa.Pubmed_QA,
"sciq": sciq.SciQ, "sciq" : sciq.SciQ,
"e2e_nlg_cleaned": e2e_nlg_cleaned.E2E_NLG_Cleaned,
"qasper": qasper.QASPER, "qasper": qasper.QASPER,
"qa4mre_2011": qa4mre.QA4MRE_2011,
"qa4mre_2012": qa4mre.QA4MRE_2012, "qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2013": qa4mre.QA4MRE_2013, "qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
"triviaqa": triviaqa.TriviaQA, "triviaqa": triviaqa.TriviaQA,
"arc_easy": arc.ARCEasy, "arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge, "arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet # "quac": quac.QuAC, # not implemented yet
"lama_trex": lama.Trex,
"lama_squad": lama.Squad,
"lama_google_re": lama.google_re,
"lama_concptnet": lama.Conceptnet,
"logiqa": logiqa.LogiQA, "logiqa": logiqa.LogiQA,
"hellaswag": hellaswag.HellaSwag, "hellaswag": hellaswag.HellaSwag,
"swag": swag.SWAG, "swag": swag.SWAG,
...@@ -155,7 +142,7 @@ TASK_REGISTRY = { ...@@ -155,7 +142,7 @@ TASK_REGISTRY = {
"squad2": squad.SQuAD2, "squad2": squad.SQuAD2,
"race": race.RACE, "race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet # "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs, "headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn, "headqa_en": headqa.HeadQAEn,
"mathqa": mathqa.MathQA, "mathqa": mathqa.MathQA,
...@@ -165,20 +152,21 @@ TASK_REGISTRY = { ...@@ -165,20 +152,21 @@ TASK_REGISTRY = {
"anli_r1": anli.ANLIRound1, "anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2, "anli_r2": anli.ANLIRound2,
"anli_r3": anli.ANLIRound3, "anli_r3": anli.ANLIRound3,
"hans": hans.HANS,
"ethics_cm": hendrycks_ethics.EthicsCM, "ethics_cm": hendrycks_ethics.EthicsCM,
"ethics_deontology": hendrycks_ethics.EthicsDeontology, "ethics_deontology": hendrycks_ethics.EthicsDeontology,
"ethics_justice": hendrycks_ethics.EthicsJustice, "ethics_justice": hendrycks_ethics.EthicsJustice,
"ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal, "ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal,
"ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism, "ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
"ethics_virtue": hendrycks_ethics.EthicsVirtue, "ethics_virtue": hendrycks_ethics.EthicsVirtue,
#"tydiqa_primary" : TyDiQA.Primary, not implemented yet
#"tydiqa_secondary" : TyDiQA.Secondary, not implemented yet "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
"truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice, "truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
"truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
     # dialogue
     "mutual": mutual.MuTual,
     "mutual_plus": mutual.MuTualPlus,
     # math
     "math_algebra": hendrycks_math.MathAlgebra,
     "math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
@@ -189,6 +177,7 @@ TASK_REGISTRY = {
     "math_precalc": hendrycks_math.MathPrecalculus,
     "math_asdiv": asdiv.Asdiv,
     "gsm8k": gsm8k.GradeSchoolMath8K,
     # arithmetic
     "arithmetic_2da": arithmetic.Arithmetic2DPlus,
     "arithmetic_2ds": arithmetic.Arithmetic2DMinus,
@@ -202,18 +191,22 @@ TASK_REGISTRY = {
     "arithmetic_1dc": arithmetic.Arithmetic1DComposite,
     # TODO Perhaps make these groups of tasks
     # e.g. anli, arithmetic, openai_translations, harness_translations
     # hendrycksTest (57 tasks)
     **hendrycks_test.create_all_tasks(),
     # e.g. wmt14-fr-en
     **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
     # chef's selection, mostly wmt20
     **translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
     # Word Scrambling and Manipulation Tasks
     "anagrams1": unscramble.Anagrams1,
     "anagrams2": unscramble.Anagrams2,
     "cycle_letters": unscramble.CycleLetters,
     "random_insertion": unscramble.RandomInsertion,
     "reversed_words": unscramble.ReversedWords,
     # Pile
     "pile_arxiv": pile.PileArxiv,
     "pile_books3": pile.PileBooks3,
@@ -237,6 +230,7 @@ TASK_REGISTRY = {
     "pile_ubuntu-irc": pile.PileUbuntuIrc,
     "pile_wikipedia": pile.PileWikipedia,
     "pile_youtubesubtitles": pile.PileYoutubeSubtitles,
     # BLiMP
     "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
     "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
@@ -305,45 +299,11 @@ TASK_REGISTRY = {
     "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
     "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
     "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
-    #GEM/mlsum
-    "mlsum_es":gem_mlsum.GEMMLSUMEs,
-    "mlsum_de":gem_mlsum.GEMMLSUMDe,
-    "mlsum_es_covid_challenge_set":gem_mlsum.GEMMLSUMEsChallgeTestCovid,
-    "mlsum_de_covid_challenge_set":gem_mlsum.GEMMLSUMDeChallgeTestCovid,
     # Requires manual download of data.
     # "storycloze_2016": storycloze.StoryCloze2016,
     # "storycloze_2018": storycloze.StoryCloze2018,
     # "sat": sat.SATAnalogies,
-    #GEM/xum
-    "gem_xsum": gem_xsum.GEMXSUM,
-    "gem_xsum_challenge_sample": gem_xsum.GEMXSUMChallgeSample,
-    "gem_xsum_challenge_test_backtranslation": gem_xsum.GEMXSUMChallgeTestBacktranslation,
-    "gem_xsum_challenge_test_bfp_02": gem_xsum.GEMXSUMChallgeTestBFP02,
-    "gem_xsum_challenge_test_bfp_05": gem_xsum.GEMXSUMChallgeTestBFP05,
-    "gem_xsum_challenge_test_nopunc": gem_xsum.GEMXSUMChallgeTestNopunc,
-    "gem_xsum_challenge_test_covid": gem_xsum.GEMXSUMChallgeTestCovid,
-    #LAMA
-    "lama-trex": lama.Trex,
-    "lama-squad": lama.Squad,
-    "lama-google_re": lama.google_re,
-    "lama-concptnet": lama.Conceptnet,
-    "bigscience-lama":lama.BigScienceLAMA,
-    # WinoBias
-    "wino_bias_type1_pro": wino_bias.WinoBiasType1Pro,
-    "wino_bias_type1_anti": wino_bias.WinoBiasType1Anti,
-    "wino_bias_type2_pro": wino_bias.WinoBiasType2Pro,
-    "wino_bias_type2_anti": wino_bias.WinoBiasType2Anti,
-    # Crows-Pairs
-    "crows_pairs_english": crows_pairs_multilingual.CrowsPairsEnglish,
-    "crows_pairs_french": crows_pairs_multilingual.CrowsPairsFrench,
-    # News
-    "huffpost": HuffPost.HuffPost,
 }
@@ -363,51 +323,19 @@ def get_task_name_from_object(task_object):
     for name, class_ in TASK_REGISTRY.items():
         if class_ is task_object:
             return name
     # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
-    return (
-        task_object.EVAL_HARNESS_NAME
-        if hasattr(task_object, "EVAL_HARNESS_NAME")
-        else type(task_object).__name__
-    )
+    return task_object.EVAL_HARNESS_NAME if hasattr(task_object, "EVAL_HARNESS_NAME") else type(task_object).__name__
 def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
     task_name_dict = {
         task_name: get_task(task_name)()
-        for task_name in task_name_list
-        if isinstance(task_name, str)
+        for task_name in task_name_list if isinstance(task_name, str)
     }
     task_name_from_object_dict = {
         get_task_name_from_object(task_object): task_object
-        for task_object in task_name_list
-        if not isinstance(task_object, str)
+        for task_object in task_name_list if not isinstance(task_object, str)
     }
     assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
     return {**task_name_dict, **task_name_from_object_dict}
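A hedged sketch of the restored get_task_dict: registered task names map to freshly constructed Task instances (constructing a task triggers its dataset download via HF datasets); the task names below are assumed examples:

import lm_eval.tasks

task_dict = lm_eval.tasks.get_task_dict(["lambada", "arc_easy"])
print(sorted(task_dict.keys()))  # ['arc_easy', 'lambada']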
def get_task_dict_promptsource(task_name_list: List[str]):
"""Loads a task instance for each prompt written for that task."""
task_name_dict = {}
for task_name in task_name_list:
assert isinstance(task_name, str)
# Static version of the Task Use this to get HF dataset path / name.
static_task_obj = get_task(task_name)
# Create the proper task name arg for DatasetTemplates.
sub_task = (
f"/{static_task_obj.DATASET_NAME}" if static_task_obj.DATASET_NAME else ""
)
ps_task_name = f"{static_task_obj.DATASET_PATH}{sub_task}"
task_prompts = DatasetTemplates(ps_task_name)
for prompt_name in task_prompts.all_template_names:
prompt = task_prompts[prompt_name]
# NOTE: We choose a sep that can be easily split.
task_name_dict[f"{task_name}+{prompt_name}"] = get_task(task_name)(
prompt=prompt
)
return task_name_dict
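For reference, a hedged sketch of what the promptsource-based lookup above resolves to; the dataset path/name "super_glue"/"boolq" is an assumed example:

from promptsource.templates import DatasetTemplates

templates = DatasetTemplates("super_glue/boolq")
for prompt_name in templates.all_template_names:
    # get_task_dict_promptsource keys each task instance as "<task_name>+<prompt_name>"
    print(f"boolq+{prompt_name}")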
@@ -10,7 +10,7 @@ provided explanations.
 Homepage: "https://github.com/facebookresearch/anli"
 """
 import numpy as np
-from lm_eval.base import rf, PromptSourceTask
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -30,7 +30,7 @@ _CITATION = """
 """
-class ANLIBase(PromptSourceTask):
+class ANLIBase(Task):
     VERSION = 0
     DATASET_PATH = "anli"
     DATASET_NAME = None
@@ -59,6 +59,51 @@ class ANLIBase(Task):
         if self.has_test_docs():
             return self.dataset["test_r" + str(self.SPLIT)]
def doc_to_text(self, doc):
# OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " " + ["True", "Neither", "False"][doc['label']]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = doc["label"]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self): def aggregation(self):
""" """
:returns: {str: [float] -> float} :returns: {str: [float] -> float}
...
@@ -58,11 +58,10 @@ class Arithmetic(Task):
     def construct_requests(self, doc, ctx):
         ll, is_prediction = rf.loglikelihood(ctx, doc["completion"])
-        return ll, is_prediction
+        return is_prediction
     def process_results(self, doc, results):
-        print(results)
-        results = results
+        is_prediction, = results
         return {
             "acc": is_prediction
         }
...
@@ -10,7 +10,7 @@ grammars.
 Homepage: https://github.com/alexwarstadt/blimp
 """
-from lm_eval.base import rf, PromptSourceTask
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -31,7 +31,7 @@ _CITATION = """
 """
-class BlimpTask(PromptSourceTask):
+class BlimpTask(Task):
     VERSION = 0
     DATASET_PATH = "blimp"
@@ -50,6 +50,58 @@ class BlimpTask(Task):
         # trained on this data.
         return self.dataset["train"]
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0
assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
return ""
def doc_to_text(self, doc):
# this method is invoked by tests only
return ""
def doc_to_target(self, doc):
# this method is invoked by tests only
return ""
def construct_requests(self, doc, ctx):
assert not ctx
# Calculate the loglikelihood for the good and the bad sentence.
# Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
return [
rf.loglikelihood("", doc["sentence_good"]),
rf.loglikelihood("", doc["sentence_bad"]),
]
def process_results(self, doc, results):
likelihood1, likelihood2 = results
# the model got this case right iff the good sentence scored higher than the bad sentence
acc = 1.0 if likelihood1 > likelihood2 else 0.0
return {
"acc": acc,
}
def higher_is_better(self):
return {
"acc": True,
}
def aggregation(self):
return {
"acc": mean,
}
class BlimpAdjunctIsland(BlimpTask): class BlimpAdjunctIsland(BlimpTask):
DATASET_NAME = "adjunct_island" DATASET_NAME = "adjunct_island"
......
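A self-contained sketch of the minimal-pair rule above; the loglikelihood values are invented, standing in for what rf.loglikelihood("", sentence) would return for the good and bad sentences:

# The model is counted as correct iff the grammatical sentence scores higher.
ll_good, ll_bad = -42.3, -45.9  # hypothetical sentence loglikelihoods
acc = 1.0 if ll_good > ll_bad else 0.0
print({"acc": acc})  # -> {'acc': 1.0}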
...@@ -12,7 +12,7 @@ Homepage: https://stanfordnlp.github.io/coqa/ ...@@ -12,7 +12,7 @@ Homepage: https://stanfordnlp.github.io/coqa/
import inspect import inspect
import transformers.data.metrics.squad_metrics as squad_metrics import transformers.data.metrics.squad_metrics as squad_metrics
import lm_eval.datasets.coqa.coqa import lm_eval.datasets.coqa.coqa
from lm_eval.base import PromptSourceTask, Task, rf, mean from lm_eval.base import Task, rf, mean
from itertools import zip_longest from itertools import zip_longest
...@@ -28,9 +28,9 @@ _CITATION = """ ...@@ -28,9 +28,9 @@ _CITATION = """
""" """
class CoQA(PromptSourceTask): class CoQA(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "coqa" DATASET_PATH = inspect.getfile(lm_eval.datasets.coqa.coqa)
DATASET_NAME = None DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
...@@ -51,21 +51,44 @@ class CoQA(PromptSourceTask): ...@@ -51,21 +51,44 @@ class CoQA(PromptSourceTask):
def test_docs(self): def test_docs(self):
pass pass
# @classmethod def doc_to_text(self, doc):
# def get_answers(cls, doc, turn_id): # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers). # and a question qi, the task is to predict the answer ai
# answers = [] doc_text = doc["story"] + '\n\n'
# answer_forturn = doc["answers"]["input_text"][turn_id - 1] for (q, a) in zip_longest(doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]): # omit target answer ai
# answers.append(answer_forturn) question = f"Q: {q}\n\n"
# additional_answers = doc.get("additional_answers") answer = f"A: {a}\n\n" if a is not None else "A:"
# if additional_answers: doc_text += question + answer
# for key in additional_answers: return doc_text
# additional_answer_for_turn = additional_answers[key]["input_text"][
# turn_id - 1 @classmethod
# ] def get_answers(cls, doc, turn_id):
# if additional_answer_for_turn.lower() not in map(str.lower, answers): # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
# answers.append(additional_answer_for_turn) answers = []
# return answers answer_forturn = doc["answers"]["input_text"][turn_id - 1]
answers.append(answer_forturn)
additional_answers = doc.get("additional_answers")
if additional_answers:
for key in additional_answers:
additional_answer_for_turn = additional_answers[key]["input_text"][turn_id - 1]
if additional_answer_for_turn.lower() not in map(str.lower, answers):
answers.append(additional_answer_for_turn)
return answers
@classmethod
def get_answer_choice(self, raw_text):
# Function maps answers to CoQA answer categories
# ~ 1/5 of the CoQA answers are Yes/No
# ~ 2/3 of the CoQA answers are span-based
# (answers overlap with the passage ignoring punctuation and case mismatch)
if raw_text == "unknown":
return '0'
if squad_metrics.normalize_answer(raw_text) == "yes":
return '1'
if squad_metrics.normalize_answer(raw_text) == "no":
return '2'
return '3' # Not a yes/no question
@staticmethod @staticmethod
def compute_scores(gold_list, pred): def compute_scores(gold_list, pred):
...@@ -75,40 +98,40 @@ class CoQA(PromptSourceTask): ...@@ -75,40 +98,40 @@ class CoQA(PromptSourceTask):
em_sum = 0.0 em_sum = 0.0
if len(gold_list) > 1: if len(gold_list) > 1:
for i in range(len(gold_list)): for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :] gold_answers = gold_list[0:i] + gold_list[i + 1:]
# predictions compared against (n) golds and take maximum # predictions compared against (n) golds and take maximum
em_sum += max( em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
squad_metrics.compute_exact(a, pred) for a in gold_answers
)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
else: else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)
return { return {'em': em_sum / max(1, len(gold_list)), 'f1': f1_sum / max(1, len(gold_list))}
"em": em_sum / max(1, len(gold_list)),
"f1": f1_sum / max(1, len(gold_list)),
}
# def stopping_criteria(self): def doc_to_target(self, doc, turnid=None):
# return "\n\n" # Default to prediction of last turn.
if turnid is None:
turnid = len(doc["questions"]["input_text"])
raw_text = doc['answers']["input_text"][turnid - 1]
return " " + raw_text
# def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of """ Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM. Requests which will be sent to the LM.
# :param doc: :param doc:
# The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str :param ctx: str
# The context string, generated by fewshot_context. This includes the natural The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
# part of the document for `doc`. part of the document for `doc`.
# """ """
# return cont_request cont_request = rf.greedy_until(ctx, ['\nQ:'])
return cont_request
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of dict where keys are the names of submetrics and values are the values of
the metric for that one document the metric for that one document
:param doc: :param doc:
...@@ -116,19 +139,16 @@ class CoQA(PromptSourceTask): ...@@ -116,19 +139,16 @@ class CoQA(PromptSourceTask):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
target = self.doc_to_target(doc).strip() turn_id = len(doc["questions"]["input_text"])
pred = results[0].strip().split("\n")[0] gold_list = self.get_answers(doc, turn_id)
scores = self.compute_scores([target], pred) pred = results[0].strip().split('\n')[0]
out = { scores = self.compute_scores(gold_list, pred)
"f1": scores["f1"],
"em": scores["em"],
}
if self.save_examples: return {
example = {"target": target, "pred": pred} "f1": scores['f1'],
return out, example "em": scores['em'],
return out }
def higher_is_better(self): def higher_is_better(self):
return { return {
......
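A toy, self-contained version of the leave-one-out exact-match logic in compute_scores, with a whitespace-based matcher standing in for squad_metrics.compute_exact:

def toy_exact(a, b):
    # Crude stand-in for squad_metrics.compute_exact: lowercase, whitespace tokenize.
    return float(a.lower().split() == b.lower().split())

def em_over_golds(gold_list, pred):
    # With multiple golds, each gold is held out and pred is scored against the rest,
    # then the per-gold maxima are averaged, mirroring compute_scores above.
    if len(gold_list) > 1:
        total = 0.0
        for i in range(len(gold_list)):
            others = gold_list[:i] + gold_list[i + 1:]
            total += max(toy_exact(g, pred) for g in others)
        return total / len(gold_list)
    return max(toy_exact(g, pred) for g in gold_list)

print(em_over_golds(["the cat", "a cat", "cat"], "a cat"))  # -> 0.666...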
"""
French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than English
https://hal.inria.fr/hal-03629677/file/ACLFinal.pdf
Measuring social biases in masked language models in English and French.
https://gitlab.inria.fr/french-crows-pairs/acl-2022-paper-data-and-code/-/tree/main
"""
from lm_eval.base import PromptSourceTask
_CITATION = """\
@inproceedings{neveol2022french,
title={French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than English},
author={N{\'e}v{\'e}ol, Aur{\'e}lie and Dupont, Yoann and Bezan{\c{c}}on, Julien and Fort, Kar{\"e}n},
booktitle={ACL 2022-60th Annual Meeting of the Association for Computational Linguistics},
year={2022}
"""
class CrowsPairsEnglish(PromptSourceTask):
VERSION = 0
DATASET_PATH = "oskarvanderwal/crows_pairs_multilingual"
DATASET_NAME = "english"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class CrowsPairsFrench(PromptSourceTask):
VERSION = 0
DATASET_PATH = "oskarvanderwal/crows_pairs_multilingual"
DATASET_NAME = "french"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
...@@ -18,7 +18,7 @@ import re ...@@ -18,7 +18,7 @@ import re
import string import string
import lm_eval.datasets.drop.drop import lm_eval.datasets.drop.drop
from scipy.optimize import linear_sum_assignment from scipy.optimize import linear_sum_assignment
from lm_eval.base import PromptSourceTask, rf from lm_eval.base import Task, rf
from lm_eval.metrics import mean from lm_eval.metrics import mean
...@@ -37,9 +37,9 @@ _CITATION = """ ...@@ -37,9 +37,9 @@ _CITATION = """
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE) _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
class DROP(PromptSourceTask): class DROP(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "drop" # inspect.getfile(lm_eval.datasets.drop.drop) DATASET_PATH = inspect.getfile(lm_eval.datasets.drop.drop)
DATASET_NAME = None DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
...@@ -52,13 +52,46 @@ class DROP(PromptSourceTask): ...@@ -52,13 +52,46 @@ class DROP(PromptSourceTask):
return False return False
def training_docs(self): def training_docs(self):
# if self._training_docs is None: if self._training_docs is None:
# self._training_docs = list() self._training_docs = list(map(self._process_doc, self.dataset["train"]))
# return self._training_docs return self._training_docs
return self.dataset["train"]
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return map(self._process_doc, self.dataset["validation"])
def _process_doc(self, doc):
return {
"id": doc["query_id"],
"passage": doc["passage"],
"question": doc["question"],
"answers": self.get_answers(doc),
}
@classmethod
def get_answers(cls, qa):
def _flatten_validated_answers(validated_answers):
""" Flattens a dict of lists of validated answers.
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
vas = []
for i in range(len(validated_answers["number"])):
vas.append({
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
})
return vas
answers = []
answers_set = set()
candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"])
for candidate in candidates:
answer = cls.parse_answer(candidate)
if answer in answers_set:
continue
answers_set.add(answer)
answers.append(answer)
return answers
@classmethod @classmethod
def parse_answer(cls, answer): def parse_answer(cls, answer):
...@@ -67,31 +100,29 @@ class DROP(PromptSourceTask): ...@@ -67,31 +100,29 @@ class DROP(PromptSourceTask):
return (str(answer["number"]),) return (str(answer["number"]),)
if answer["spans"] != []: if answer["spans"] != []:
return tuple(answer["spans"]) return tuple(answer["spans"])
return ( return (" ".join([answer["date"]["day"],
" ".join( answer["date"]["month"],
[answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]] answer["date"]["year"]]).strip(),)
).strip(),
)
# def doc_to_text(self, doc): def doc_to_text(self, doc):
# return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:" return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
# def doc_to_target(self, doc): def doc_to_target(self, doc):
# return " " + ", ".join(doc["answers"][0]) return " " + ", ".join(doc["answers"][0])
# def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM. Requests which will be sent to the LM.
# :param doc: :param doc:
# The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str :param ctx: str
# The context string, generated by fewshot_context. This includes the natural The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
# part of the document for `doc`. part of the document for `doc`.
# """ """
# conts = [rf.greedy_until(ctx, ["."])] conts = [rf.greedy_until(ctx, ["."])]
# return conts return conts
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
...@@ -103,21 +134,7 @@ class DROP(PromptSourceTask): ...@@ -103,21 +134,7 @@ class DROP(PromptSourceTask):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
preds, golds = results, doc["answers"]
pred = results[0].strip()
target = self.doc_to_target(doc).strip()
print("*" * 80)
print(f"DOC: {doc}")
print(f"PS: {self.prompt.apply(doc)}")
print(f"TEXT: {self.doc_to_text(doc)}")
print(f"TARGET: {target} END TARGET")
print(f"PRED: {pred} END PRED")
print("*" * 80)
preds = [pred]
golds = [target]
max_em = 0 max_em = 0
max_f1 = 0 max_f1 = 0
for gold_answer in golds: for gold_answer in golds:
...@@ -125,7 +142,10 @@ class DROP(PromptSourceTask): ...@@ -125,7 +142,10 @@ class DROP(PromptSourceTask):
if gold_answer[0].strip(): if gold_answer[0].strip():
max_em = max(max_em, exact_match) max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score) max_f1 = max(max_f1, f1_score)
return {"em": max_em, "f1": max_f1} return {
"em": max_em,
"f1": max_f1
}
def get_metrics(self, predicted, gold): def get_metrics(self, predicted, gold):
""" """
...@@ -138,9 +158,7 @@ class DROP(PromptSourceTask): ...@@ -138,9 +158,7 @@ class DROP(PromptSourceTask):
predicted_bags = self._answer_to_bags(predicted) predicted_bags = self._answer_to_bags(predicted)
gold_bags = self._answer_to_bags(gold) gold_bags = self._answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len( if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
predicted_bags[0]
) == len(gold_bags[0]):
exact_match = 1.0 exact_match = 1.0
else: else:
exact_match = 0.0 exact_match = 0.0
...@@ -172,9 +190,7 @@ class DROP(PromptSourceTask): ...@@ -172,9 +190,7 @@ class DROP(PromptSourceTask):
for gold_index, gold_item in enumerate(gold): for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted): for pred_index, pred_item in enumerate(predicted):
if self._match_numbers_if_present(gold_item, pred_item): if self._match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = self._compute_f1( scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
pred_item, gold_item
)
row_ind, col_ind = linear_sum_assignment(-scores) row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))]) max_scores = np.zeros([max(len(gold), len(predicted))])
...@@ -240,11 +256,7 @@ class DROP(PromptSourceTask): ...@@ -240,11 +256,7 @@ class DROP(PromptSourceTask):
def _normalize(self, answer): def _normalize(self, answer):
tokens = [ tokens = [
self._white_space_fix( self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
self._remove_articles(
self._fix_number(self._remove_punc(token.lower()))
)
)
for token in self._tokenize(answer) for token in self._tokenize(answer)
] ]
tokens = [token for token in tokens if token.strip()] tokens = [token for token in tokens if token.strip()]
...@@ -257,7 +269,10 @@ class DROP(PromptSourceTask): ...@@ -257,7 +269,10 @@ class DROP(PromptSourceTask):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metrics
""" """
return {"em": mean, "f1": mean} return {
"em": mean,
"f1": mean
}
def higher_is_better(self): def higher_is_better(self):
""" """
...@@ -265,4 +280,7 @@ class DROP(PromptSourceTask): ...@@ -265,4 +280,7 @@ class DROP(PromptSourceTask):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better whether a higher value of the submetric is better
""" """
return {"em": True, "f1": True} return {
"em": True,
"f1": True
}
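A standalone sketch of the column-to-row flattening described in get_answers above, using a made-up validated_answers record:

# DROP stores validated answers column-wise; the harness re-rows them so each
# candidate answer can be parsed and deduplicated independently.
validated = {
    "number": ["1", "8"],
    "date": [{"day": "", "month": "", "year": ""}, {"day": "", "month": "", "year": ""}],
    "spans": [[], []],
}
flattened = [
    {"number": validated["number"][i],
     "date": validated["date"][i],
     "spans": validated["spans"][i]}
    for i in range(len(validated["number"]))
]
print([v["number"] for v in flattened])  # -> ['1', '8']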
"""
Semantic Noise Matters for Neural Natural Language Generation
http://arxiv.org/abs/1911.03905
A cleaned version of the dataset from the E2E NLG Challenge.
The dataset contains MR with restaurant attributes and corresponding descriptions.
Homepage: https://github.com/tuetschek/e2e-cleaning
"""
from lm_eval.base import PromptSourceTask, rf
from lm_eval import metrics
_CITATION = """
@inproceedings{dusek-etal-2019-semantic,
title = "Semantic Noise Matters for Neural Natural Language Generation",
author = "Du{\v{s}}ek, Ond{\v{r}}ej and
Howcroft, David M. and
Rieser, Verena",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8652",
doi = "10.18653/v1/W19-8652",
pages = "421--426",
}
"""
# Work in progress
class E2E_NLG_Cleaned(PromptSourceTask):
VERSION = 0
DATASET_PATH = "e2e_nlg_cleaned"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
# We cache training documents in `self._training_docs` for faster
# few-shot processing. If the data is too large to fit in memory,
# return the training data as a generator instead of a list.
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
def max_generation_length(self):
return 64
def invalid_doc_for_prompt(self, doc) -> bool:
"""The QA prompts are not applicable to all the examples, we want to filter these out."""
return self.prompt.name.endswith("_qa") or self.prompt.name == "family_friendly_yes_no"
def doc_to_text(self, doc) -> str:
# If the response is not defined in the promptsource template, apply() returns a single-element list containing an empty string.
text = self.prompt.apply(doc)[0]
return text
def construct_requests(self, doc, ctx, args):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
_requests = []
# NOTE: In the future, target will be a list of strings.
request_args = {
"stopping_criteria": self.stopping_criteria(),
"max_generation_length": self.max_generation_length(),
"num_fewshot": args["num_fewshot"],
}
# Skip examples for which the templates are not applicable
if ctx != "":
cont_request = rf.greedy_until(ctx, request_args)
_requests.append(cont_request)
return _requests
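A minimal sketch of the "skip inapplicable templates" check in construct_requests, with a stub standing in for the promptsource template's apply and invented restaurant records:

def apply_template_stub(doc):
    # Stand-in for self.prompt.apply(doc)[0]: returns "" when the template
    # does not apply to this example (e.g. a family-friendly QA prompt).
    return f"Describe: {doc['name']}" if doc.get("family_friendly") else ""

requests = []
for doc in [{"name": "Blue Spice"}, {"name": "Aromi", "family_friendly": "yes"}]:
    ctx = apply_template_stub(doc)
    if ctx != "":  # mirrors the ctx != "" guard above
        requests.append(("greedy_until", ctx))
print(len(requests))  # -> 1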
"""
ASSET: ASSET (Alva-Manchego et al., 2020) is multi-reference dataset
for the evaluation of sentence simplification in English. The dataset
uses the same 2,359 sentences from TurkCorpus (Xu et al., 2016)
and each sentence is associated with 10 crowdsourced simplifications.
Unlike previous simplification datasets, which contain a single
transformation (e.g., lexical paraphrasing in TurkCorpus or sentence
splitting in HSplit), the simplifications in ASSET encompass a variety
of rewriting transformations.
https://aclanthology.org/2020.acl-main.424.pdf
TurkCorpus: TURKCorpus is a multi-reference dataset for the evaluation of
sentence simplification in English. The dataset consists of 2,359 sentences
from the Parallel Wikipedia Simplification (PWKP) corpus. Each sentence is
associated with 8 crowdsourced simplifications that focus on only lexical
paraphrasing (no sentence splitting or deletion).
https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@article{DBLP:journals/corr/abs-2005-00481,
author = {Fernando Alva{-}Manchego and
Louis Martin and
Antoine Bordes and
Carolina Scarton and
Beno{\^{\i}}t Sagot and
Lucia Specia},
title = {{ASSET:} {A} Dataset for Tuning and Evaluation of Sentence Simplification
Models with Multiple Rewriting Transformations},
journal = {CoRR},
volume = {abs/2005.00481},
year = {2020},
url = {https://arxiv.org/abs/2005.00481},
eprinttype = {arXiv},
eprint = {2005.00481},
timestamp = {Thu, 14 Oct 2021 16:38:25 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2005-00481.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}"""
""""@article{Xu-EtAl:2016:TACL,
author = {Wei Xu and Courtney Napoles and Ellie Pavlick and Quanze Chen and Chris Callison-Burch},
title = {Optimizing Statistical Machine Translation for Text Simplification},
journal = {Transactions of the Association for Computational Linguistics},
volume = {4},
year = {2016},
url = {https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf},
pages = {401--415}
}"""
class AssetTurk(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/wiki_auto_asset_turk"
DATASET_NAME = None
SPLIT = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
return self.dataset[str(self.SPLIT)]
def max_generation_length(self):
return 200
class AssetTest(AssetTurk):
SPLIT = "test_asset"
class TurkTest(AssetTurk):
SPLIT = "test_turk"
class AssetTest1(AssetTurk):
SPLIT = "challenge_test_asset_backtranslation"
class AssetTest2(AssetTurk):
SPLIT = "challenge_test_asset_bfp02"
class AssetTest3(AssetTurk):
SPLIT = "challenge_test_asset_bfp05"
class AssetTest4(AssetTurk):
SPLIT = "challenge_test_asset_nopunc"
class TurkTest1(AssetTurk):
SPLIT = "challenge_test_turk_backtranslation"
class TurkTest2(AssetTurk):
SPLIT = "challenge_test_turk_bfp02"
class TurkTest3(AssetTurk):
SPLIT = "challenge_test_turk_bfp05"
class TurkTest4(AssetTurk):
SPLIT = "challenge_test_turk_nopunc"
ASSET_TURK_CLASSES = [
AssetTest,
TurkTest,
TurkTest1,
TurkTest2,
TurkTest3,
TurkTest4,
AssetTest1,
AssetTest2,
AssetTest3,
AssetTest4,
]
def construct_tasks():
tasks = {}
for asset_turk_class in ASSET_TURK_CLASSES:
tasks[f"GEM/wiki_auto_asset_turk_{asset_turk_class.SPLIT}"] = asset_turk_class
return tasks
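A compact sketch of the registry pattern that construct_tasks implements, with placeholder classes standing in for the split-specific subclasses above:

class _Base:
    SPLIT = None

class _TestAsset(_Base):
    SPLIT = "test_asset"

class _TestTurk(_Base):
    SPLIT = "test_turk"

# One task name per challenge split, keyed by the class-level SPLIT attribute.
registry = {f"GEM/wiki_auto_asset_turk_{cls.SPLIT}": cls for cls in (_TestAsset, _TestTurk)}
print(sorted(registry))
# -> ['GEM/wiki_auto_asset_turk_test_asset', 'GEM/wiki_auto_asset_turk_test_turk']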
"""
MLSUM: The Multilingual Summarization Corpus
https://aclanthology.org/2020.emnlp-main.647/
This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset.
Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.
Together with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.
We report cross-lingual comparative analyses based on state-of-the-art systems.
These highlight existing biases which motivate the use of a multi-lingual dataset.
Homepage: https://gitlab.lip6.fr/scialom/mlsum_data/-/raw/master/MLSUM/
"""
from numpy import True_
from lm_eval.base import PromptSourceTask
_CITATION = """
@article{scialom2020mlsum,
title={MLSUM: The Multilingual Summarization Corpus},
author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo},
journal={arXiv preprint arXiv:2004.14900},
year={2020}
}
"""
class GEMMLSUMEsBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/mlsum"
DATASET_NAME = "es"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class GEMMLSUMEs(GEMMLSUMEsBase):
'''This is for the standard train/validation/test splits.'''
SPLIT = ''
class GEMMLSUMEsChallgeTestCovid(GEMMLSUMEsBase):
'''This is for the challenge_test_covid split.'''
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMMLSUMDeBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/mlsum"
DATASET_NAME = "de"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class GEMMLSUMDe(GEMMLSUMDeBase):
'''This is for the standard train/validation/test splits.'''
SPLIT = ''
class GEMMLSUMDeChallgeTestCovid(GEMMLSUMDeBase):
'''This is for the challenge_test_covid split.'''
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
"""
The 2020 Bilingual, Bi-Directional WebNLG+ Shared Task:
Overview and Evaluation Results (WebNLG+ 2020)
https://aclanthology.org/2020.webnlg-1.7/
WebNLG+ offers two challenges: (i) mapping sets of RDF triples
to English or Russian text (generation) and (ii) converting
English or Russian text to sets of RDF triples (semantic parsing).
Compared to the eponymous WebNLG challenge, WebNLG+ provides an
extended dataset that enables the training, evaluation, and
comparison of microplanners and semantic parsers. In this paper,
we present the results of the generation and semantic parsing
task for both English and Russian and provide a brief
description of the participating systems.
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@inproceedings{castro-ferreira-etal-2020-2020,
title = "The 2020 Bilingual, Bi-Directional {W}eb{NLG}+ Shared Task: Overview and Evaluation Results ({W}eb{NLG}+ 2020)",
author = "Castro Ferreira, Thiago and
Gardent, Claire and
Ilinykh, Nikolai and
van der Lee, Chris and
Mille, Simon and
Moussallem, Diego and
Shimorina, Anastasia",
booktitle = "Proceedings of the 3rd International Workshop on Natural Language Generation from the Semantic Web (WebNLG+)",
month = "12",
year = "2020",
address = "Dublin, Ireland (Virtual)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.webnlg-1.7",
pages = "55--76",
abstract = "WebNLG+ offers two challenges: (i) mapping sets of RDF triples to English or Russian text (generation) and (ii) converting English or Russian text to sets of RDF triples (semantic parsing). Compared to the eponymous WebNLG challenge, WebNLG+ provides an extended dataset that enable the training, evaluation, and comparison of microplanners and semantic parsers. In this paper, we present the results of the generation and semantic parsing task for both English and Russian and provide a brief description of the participating systems.",
}
"""
class WebNLG(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/web_nlg"
DATASET_NAME = "en"
SPLIT = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
if self.SPLIT is not None:
return self.dataset[str(self.SPLIT)]
else:
return self.dataset["test"]
def max_generation_length(self):
return 250
class WebNLGRu(WebNLG):
DATASET_NAME = "ru"
## En Challenge Sets
class WebNLGEn1(WebNLG):
SPLIT = "challenge_validation_sample"
class WebNLGEn2(WebNLG):
SPLIT = "challenge_test_scramble"
class WebNLGEn3(WebNLG):
SPLIT = "challenge_test_numbers"
## Ru Challenge Sets
class WebNLGRu1(WebNLG):
DATASET_NAME = "ru"
SPLIT = "challenge_validation_sample"
class WebNLGRu2(WebNLG):
DATASET_NAME = "ru"
SPLIT = "challenge_test_scramble"
WEBNLG_CLASSES = [
WebNLG,
WebNLGRu,
WebNLGEn1,
WebNLGEn2,
WebNLGEn3,
WebNLGRu1,
WebNLGRu2,
]
def construct_tasks():
tasks = {}
for webnlg_class in WEBNLG_CLASSES:
if webnlg_class.SPLIT is None:
tasks[f"GEM/web_nlg_{webnlg_class.DATASET_NAME}"] = webnlg_class
else:
tasks[
f"GEM/web_nlg_{webnlg_class.DATASET_NAME}_{webnlg_class.SPLIT}"
] = webnlg_class
return tasks
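A small sketch of the naming scheme used by construct_tasks above: base tasks are keyed by language only, while challenge sets get the split appended. The helper name here is hypothetical:

def webnlg_task_name(dataset_name, split):
    # Mirrors the two branches in construct_tasks: no suffix for the default
    # test split, otherwise "<language>_<challenge split>".
    if split is None:
        return f"GEM/web_nlg_{dataset_name}"
    return f"GEM/web_nlg_{dataset_name}_{split}"

print(webnlg_task_name("en", None))                       # -> GEM/web_nlg_en
print(webnlg_task_name("ru", "challenge_test_scramble"))  # -> GEM/web_nlg_ru_challenge_test_scramble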