Merge pull request #316 from jon-tow/master

Revert "Merge branch 'master' into master"

Merge pull request #316 from jon-tow/master
Revert "Merge branch 'master' into master"
cf074822 · Stella Biderman · GitHub · 5fe7e2c0 · 7585ec56 · 5fe7e2c0
Unverified Commit cf074822 authored Apr 29, 2022 by Stella Biderman Committed by GitHub Apr 29, 2022
13 changed files
--- a/lm_eval/tasks/gem_xsum.py
+++ b/lm_eval/tasks/gem_xsum.py
-"""
-Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization
-https://arxiv.org/pdf/1808.08745.pdf
-The dataset is for the task of abstractive summarization in its extreme form; it is about summarizing a document in a single sentence. It introduces extreme summarization, a new single-document summarization task which does not favor extractive strategies and calls for an abstractive modeling approach. The idea is to create a short, one-sentence news summary answering the question "What is the article about?".
-This particularly uses the dataset that is part of the GEM benchmark
-Homepage: https://github.com/EdinburghNLP/XSum
-The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics
-https://arxiv.org/pdf/2102.01672v3.pdf
-Write a Short Description of the task.
-Homepage: https://gem-benchmark.com/data_cards/XSum
-"""
-from lm_eval.base import PromptSourceTask
-from lm_eval.base import Task, rf
-_CITATION = """
-@InProceedings{xsum-emnlp,
-  author =      "Shashi Narayan and Shay B. Cohen and Mirella Lapata",
-  title =       "Don't Give Me the Details, Just the Summary! {T}opic-Aware Convolutional Neural Networks for Extreme Summarization",
-  booktitle =   "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing ",
-  year =        "2018",
-  address =     "Brussels, Belgium",
-}
-"""
-class GEMXSUMBase(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "GEM/xsum"
-    DATASET_NAME = None
-    SPLIT = None
-    def has_training_docs(self):
-        return True
-    def has_validation_docs(self):
-        return True
-    def has_test_docs(self):
-        return True
-    def training_docs(self):
-        if self.has_training_docs():
-            # We cache training documents in `self._training_docs` for faster
-            # few-shot processing. If the data is too large to fit in memory,
-            # return the training data as a generator instead of a list.
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["test"]
-    def max_generation_length(self):
-            return 64
-class GEMXSUM(GEMXSUMBase):
-    '''this is for train/validation/test'''
-    SPLIT = ''
-class GEMXSUMChallgeSample(GEMXSUMBase):
-    '''this is for challenge_train_sample/challenge_validation_sample'''
-    SPLIT = 'challenge_sample'
-    def has_test_docs(self):
-        return False
-    def training_docs(self):
-        if self.has_training_docs():
-            # We cache training documents in `self._training_docs` for faster
-            # few-shot processing. If the data is too large to fit in memory,
-            # return the training data as a generator instead of a list.
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["challenge_train_sample"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["challenge_validation_sample"]
-class GEMXSUMChallgeTestBacktranslation(GEMXSUMBase):
-    '''this is for challenge_test_backtranslation'''
-    SPLIT = 'challenge_test_backtranslation'
-    def has_training_docs(self):
-        return False
-    def has_validation_docs(self):
-        return False
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset[self.SPLIT]
-class GEMXSUMChallgeTestBFP02(GEMXSUMBase):
-    '''this is for challenge_test_bfp_02'''
-    SPLIT = 'challenge_test_bfp_02'
-    def has_training_docs(self):
-        return False
-    def has_validation_docs(self):
-        return False
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset[self.SPLIT]
-class GEMXSUMChallgeTestBFP05(GEMXSUMBase):
-    '''this is for challenge_test_bfp_05'''
-    SPLIT = 'challenge_test_bfp_05'
-    def has_training_docs(self):
-        return False
-    def has_validation_docs(self):
-        return False
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset[self.SPLIT]
-class GEMXSUMChallgeTestNopunc(GEMXSUMBase):
-    '''this is for challenge_test_nopunc'''
-    SPLIT = 'challenge_test_nopunc'
-    def has_training_docs(self):
-        return False
-    def has_validation_docs(self):
-        return False
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset[self.SPLIT]
-class GEMXSUMChallgeTestCovid(GEMXSUMBase):
-    '''this is for challenge_test_covid'''
-    SPLIT = 'challenge_test_covid'
-    def has_training_docs(self):
-        return False
-    def has_validation_docs(self):
-        return False
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset[self.SPLIT]
\ No newline at end of file
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -14,7 +14,7 @@ respect to a wide range of linguistic phenomena found in natural language.
 Homepage: https://gluebenchmark.com/
 """
 import numpy as np
-from lm_eval.base import PromptSourceTask, rf, Task
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno
 from lm_eval.utils import general_detokenize
@@ -45,7 +45,7 @@ _CITATION = """
 # Single-Sentence Tasks
-class CoLA(PromptSourceTask):
+class CoLA(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "cola"
@@ -67,20 +67,37 @@ class CoLA(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
-    # def process_results(self, doc, results):
+    def doc_to_text(self, doc):
-    #     answer_choices_list = self.prompt.get_answer_choices_list(doc)
+        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
-    #     pred = np.argmax(results)
-    #     target = answer_choices_list.index(self.doc_to_target(doc).strip())
-    #     return {"mcc": (target, pred)}
-    # def higher_is_better(self):
+    def doc_to_target(self, doc):
-    #     return {"mcc": True}
+        return " {}".format({1: "yes", 0: "no"}[doc["label"]])
-    # def aggregation(self):
+    def construct_requests(self, doc, ctx):
-    #     return {"mcc": matthews_corrcoef}
+        ll_true, _ = rf.loglikelihood(ctx, " yes")
+        ll_false, _ = rf.loglikelihood(ctx, " no")
+        return ll_true, ll_false
+    def process_results(self, doc, results):
+        ll_true, ll_false = results
+        pred = ll_true > ll_false
+        gold = doc["label"]
+        return {
+            "mcc": (gold, pred)
+        }
-class SST(PromptSourceTask):
+    def higher_is_better(self):
+        return {
+            "mcc": True
+        }
+    def aggregation(self):
+        return {
+            "mcc": matthews_corrcoef
+        }
+class SST(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"
@@ -102,11 +119,42 @@ class SST(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
+    def doc_to_text(self, doc):
+        return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
+            general_detokenize(doc["sentence"]),
+        )
+    def doc_to_target(self, doc):
+        return " {}".format({1: "positive", 0: "negative"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        ll_positive, _ = rf.loglikelihood(ctx, " positive")
+        ll_negative, _ = rf.loglikelihood(ctx, " negative")
+        return ll_positive, ll_negative
+    def process_results(self, doc, results):
+        ll_positive, ll_negative = results
+        pred = ll_positive > ll_negative
+        gold = doc["label"]
+        return {
+            "acc": pred == gold
+        }
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+    def aggregation(self):
+        return {
+            "acc": mean
+        }
 # Inference Tasks
-class MNLI(PromptSourceTask):
+class MNLI(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "mnli"
@@ -133,6 +181,41 @@ class MNLI(PromptSourceTask):
        if self.has_test_docs():
            return self.dataset["test_matched"]
+    def doc_to_text(self, doc):
+        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
+            doc["premise"],
+            doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
+        )
+    def doc_to_target(self, doc):
+        # True = entailment
+        # False = contradiction
+        # Neither = neutral
+        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        ll_true, _ = rf.loglikelihood(ctx, " True")
+        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
+        ll_false, _ = rf.loglikelihood(ctx, " False")
+        return ll_true, ll_neither, ll_false
+    def process_results(self, doc, results):
+        gold = doc["label"]
+        pred = np.argmax(results)
+        return {
+            "acc": pred == gold
+        }
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+    def aggregation(self):
+        return {
+            "acc": mean
+        }
 class MNLIMismatched(MNLI):
    VERSION = 0
@@ -146,7 +229,7 @@ class MNLIMismatched(MNLI):
            return self.dataset["test_mismatched"]
-class QNLI(PromptSourceTask):
+class QNLI(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"
@@ -168,8 +251,42 @@ class QNLI(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
+    def doc_to_text(self, doc):
+        return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
+            doc["question"],
+            doc["sentence"],
+        )
+    def doc_to_target(self, doc):
+        # True = entailment
+        # False = not entailment
+        return " {}".format({0: "yes", 1: "no"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        ll_no, _ = rf.loglikelihood(ctx, " no")
+        return ll_yes, ll_no
+    def process_results(self, doc, results):
+        ll_yes, ll_no = results
+        pred = ll_no > ll_yes
+        gold = doc["label"]
+        return {
+            "acc": pred == gold
+        }
-class WNLI(PromptSourceTask):
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+    def aggregation(self):
+        return {
+            "acc": mean
+        }
+class WNLI(Task):
    VERSION = 1
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"
@@ -184,13 +301,49 @@ class WNLI(PromptSourceTask):
        return False
    def training_docs(self):
-        return self.dataset["train"]
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
    def validation_docs(self):
        return self.dataset["validation"]
+    def doc_to_text(self, doc):
+        return "{}\nQuestion: {} True or False?\nAnswer:".format(
+            doc["sentence1"],
+            doc["sentence2"],
+        )
+    def doc_to_target(self, doc):
+        # True = entailment
+        # False = not_entailment
+        return " {}".format({0: "False", 1: "True"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        ll_true, _ = rf.loglikelihood(ctx, " True")
+        ll_false, _ = rf.loglikelihood(ctx, " False")
+        return ll_true, ll_false
+    def process_results(self, doc, results):
+        ll_true, ll_false = results
+        pred = ll_true > ll_false
+        gold = doc["label"]
+        return {
+            "acc": pred == gold
+        }
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
+    def aggregation(self):
+        return {
+            "acc": mean
+        }
-class RTE(PromptSourceTask):
+class RTE(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "rte"
@@ -212,17 +365,45 @@ class RTE(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
+    def doc_to_text(self, doc):
+        return "{}\nQuestion: {} True or False?\nAnswer:".format(
+            doc["sentence1"],
+            doc["sentence2"],
+        )
+    def doc_to_target(self, doc):
+        # 0 = entailment
+        # 1 = not_entailment
+        return " {}".format({0: "True", 1: "False"}[doc["label"]])
+    def construct_requests(self, doc, ctx):
+        ll_true, _ = rf.loglikelihood(ctx, " True")
+        ll_false, _ = rf.loglikelihood(ctx, " False")
+        return ll_true, ll_false
+    def process_results(self, doc, results):
+        ll_true, ll_false = results
+        pred = ll_false > ll_true
+        gold = doc["label"]
+        return {
+            "acc": pred == gold
+        }
    def higher_is_better(self):
-        return {"acc": True}
+        return {
+            "acc": True
+        }
    def aggregation(self):
-        return {"acc": mean}
+        return {
+            "acc": mean
+        }
 # Similarity and Paraphrase Tasks
-class MRPC(PromptSourceTask):
+class MRPC(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"
@@ -244,8 +425,43 @@ class MRPC(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
+    def doc_to_text(self, doc):
+        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
+            general_detokenize(doc["sentence1"]),
+            general_detokenize(doc["sentence2"]),
+        )
+    def doc_to_target(self, doc):
+        return " {}".format(yesno(doc["label"]))
+    def construct_requests(self, doc, ctx):
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        ll_no, _ = rf.loglikelihood(ctx, " no")
+        return ll_yes, ll_no
+    def process_results(self, doc, results):
+        ll_yes, ll_no = results
+        gold = doc["label"]
+        pred = ll_yes > ll_no
+        return {
+            "acc": pred == gold,
+            "f1": (gold, pred),
+        }
-class QQP(PromptSourceTask):
+    def higher_is_better(self):
+        return {
+            "acc": True,
+            "f1": True
+        }
+    def aggregation(self):
+        return {
+            "acc": mean,
+            "f1": f1_score
+        }
+class QQP(Task):
    VERSION = 0
    DATASET_PATH = "glue"
    DATASET_NAME = "qqp"
@@ -267,6 +483,41 @@ class QQP(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
+    def doc_to_text(self, doc):
+        return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
+            doc["question1"],
+            doc["question2"],
+        )
+    def doc_to_target(self, doc):
+        return " {}".format(yesno(doc["label"]))
+    def construct_requests(self, doc, ctx):
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        ll_no, _ = rf.loglikelihood(ctx, " no")
+        return ll_yes, ll_no
+    def process_results(self, doc, results):
+        ll_yes, ll_no = results
+        gold = doc["label"]
+        pred = ll_yes > ll_no
+        return {
+            "acc": pred == gold,
+            "f1": (gold, pred),
+        }
+    def higher_is_better(self):
+        return {
+            "acc": True,
+            "f1": True
+        }
+    def aggregation(self):
+        return {
+            "acc": mean,
+            "f1": f1_score
+        }
 class STSB(Task):
    VERSION = 0
@@ -303,22 +554,22 @@ class STSB(Task):
        return " {}".format(doc["label"])
    def construct_requests(self, doc, ctx):
-        """Uses RequestFactory to construct Requests and returns an iterable of
+        """ Uses RequestFactory to construct Requests and returns an iterable of 
        Requests which will be sent to the LM.
        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural
+            The context string, generated by fewshot_context. This includes the natural 
            language description, as well as the few shot examples, and the question
-            part of the document for `doc`.
+            part of the document for `doc`. 
        """
        # TODO: implement evaluation.
-        raise NotImplementedError("Evaluation not implemented")
+        raise NotImplementedError('Evaluation not implemented')
    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
+        """Take a single document and the LM results and evaluates, returning a 
-        dict where keys are the names of submetrics and values are the values of
+        dict where keys are the names of submetrics and values are the values of 
        the metric for that one document
        :param doc:
@@ -327,22 +578,22 @@ class STSB(Task):
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError("Evaluation not implemented")
+        raise NotImplementedError('Evaluation not implemented')
    def aggregation(self):
        """
        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
+            A dictionary where keys are the names of submetrics and values are 
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
-        raise NotImplementedError("Evaluation not implemented")
+        raise NotImplementedError('Evaluation not implemented')
    def higher_is_better(self):
        """
        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
+            A dictionary where keys are the names of submetrics and values are 
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
-        raise NotImplementedError("Evaluation not implemented")
+        raise NotImplementedError('Evaluation not implemented')
--- a/lm_eval/tasks/hans.py
+++ b/lm_eval/tasks/hans.py
-"""
-Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference
-https://arxiv.org/abs/1902.01007
-A controlled evaluation set called HANS (Heuristic Analysis for NLI Systems),
-which contains many examples where the heuristics fail.
-Homepage: https://github.com/tommccoy1/hans
-"""
-from lm_eval.base import PromptSourceTask
-_CITATION = """
-@inproceedings{mccoy-etal-2019-right,
-    title = "Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference",
-    author = "McCoy, Tom  and
-      Pavlick, Ellie  and
-      Linzen, Tal",
-    booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
-    month = jul,
-    year = "2019",
-    address = "Florence, Italy",
-    publisher = "Association for Computational Linguistics",
-    url = "https://aclanthology.org/P19-1334",
-    doi = "10.18653/v1/P19-1334",
-    pages = "3428--3448",
-    abstract = "A machine learning system can score well on a given test set by relying on heuristics that are effective for frequent example types but break down in more challenging cases. We study this issue within natural language inference (NLI), the task of determining whether one sentence entails another. We hypothesize that statistical NLI models may adopt three fallible syntactic heuristics: the lexical overlap heuristic, the subsequence heuristic, and the constituent heuristic. To determine whether models have adopted these heuristics, we introduce a controlled evaluation set called HANS (Heuristic Analysis for NLI Systems), which contains many examples where the heuristics fail. We find that models trained on MNLI, including BERT, a state-of-the-art model, perform very poorly on HANS, suggesting that they have indeed adopted these heuristics. We conclude that there is substantial room for improvement in NLI systems, and that the HANS dataset can motivate and measure progress in this area.",
-}
-"""
-class HANS(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "hans"
-    DATASET_NAME = None
-    def has_training_docs(self):
-        return True
-    def has_validation_docs(self):
-        return True
-    def has_test_docs(self):
-        return False
-    def training_docs(self):
-        if self.has_training_docs():
-            # We cache training documents in `self._training_docs` for faster
-            # few-shot processing. If the data is too large to fit in memory,
-            # return the training data as a generator instead of a list.
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["test"]
--- a/lm_eval/tasks/lama.py
+++ b/lm_eval/tasks/lama.py
-"""
-https://arxiv.org/abs/1909.01066
-https://arxiv.org/abs/2005.04611
-LAMA is a probe dataset to test the factual and commonsense knowledge in language models. The dataset includes a subset of
-Google_RE (https://code.google.com/archive/p/relation-extraction-corpus/), TRex (subset of wikidata triples), 
-Conceptnet (https://github.com/commonsense/conceptnet5/wiki) and Squad. 
-Homepage: https://github.com/facebookresearch/LAMA
-"""
-from lm_eval.base import PromptSourceTask
-import numpy as np 
-from lm_eval.metrics import mean
-from typing import Optional
-_CITATION = """
-@inproceedings{petroni2019language, title={Language Models as Knowledge Bases?},
-               author={F. Petroni, T. Rockt{"{a}}schel, A. H. Miller, P. Lewis, A. Bakhtin, Y. Wu and S. Riedel},
-               booktitle={In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2019}, year={2019} }
-@inproceedings{petroni2020how,
-               title={How Context Affects Language Models' Factual Predictions},
-               author={Fabio Petroni and Patrick Lewis and Aleksandra Piktus and Tim Rockt{"a}schel and Yuxiang Wu and Alexander H. Miller and Sebastian Riedel},
-               booktitle={Automated Knowledge Base Construction}, year={2020}, url={https://openreview.net/forum?id=025X0zPfn} }
-"""
-class BigScienceLAMA(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "janck/bigscience-lama"
-    DATASET_NAME = None
-    def has_training_docs(self):
-        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
-        return False
-    def has_validation_docs(self):
-        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
-        return False
-    def has_test_docs(self):
-        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
-        return True
-    def training_docs(self):
-        if self.has_training_docs():
-            return self.dataset["train"]
-class Trex(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "lama"
-    DATASET_NAME = "trex"
-    def has_training_docs(self):
-        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
-        return False
-    def has_validation_docs(self):
-        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
-        return False
-    def has_test_docs(self):
-        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
-        return True
-    def training_docs(self):
-        if self.has_training_docs():
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["train"]
-    def process_results(self, doc, results):
-        out = {}
-        #gold = doc
-        pred = results[0].strip()
-        target = self.doc_to_target(doc)['obj_label']
-        #pred = np.argmax(results)
-        out["acc"] = pred == target
-        if self.save_examples:
-            example = {
-                "pred": pred,
-                "target": target,
-            }
-            return out, example
-        return out
-    def higher_is_better(self):
-        return {"acc": True}
-    def aggregation(self):
-        return {"acc": mean}
-    def doc_to_target(self, doc):
-        return doc
-class google_re(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "lama"
-    DATASET_NAME = "google_re"
-    def has_training_docs(self):
-        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
-        return False
-    def has_validation_docs(self):
-        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
-        return False
-    def has_test_docs(self):
-        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
-        return True
-    def training_docs(self):
-        if self.has_training_docs():
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["train"]
-    def process_results(self, doc, results):
-        out = {}
-        pred = results[0].strip()
-        target = self.doc_to_target(doc)['obj_label']
-        out["acc"] = pred == target
-        if self.save_examples:
-            example = {
-                "pred": pred,
-                "target": target,
-            }
-            return out, example
-        return out
-    def higher_is_better(self):
-        return {"acc": True}
-    def aggregation(self):
-        return {"acc": mean}
-    def doc_to_target(self, doc):
-        return doc
-class Conceptnet(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "lama"
-    DATASET_NAME = "conceptnet"
-    def has_training_docs(self):
-        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
-        return False
-    def has_validation_docs(self):
-        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
-        return False
-    def has_test_docs(self):
-        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
-        return True
-    def training_docs(self):
-        if self.has_training_docs():
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["train"]
-    def process_results(self, doc, results):
-        out = {}
-        pred = results[0].strip()
-        target = self.doc_to_target(doc)['obj_label']
-        out["acc"] = pred == target
-        if self.save_examples:
-            example = {
-                "pred": pred,
-                "target": target,
-            }
-            return out, example
-        return out
-    def higher_is_better(self):
-        return {"acc": True}
-    def aggregation(self):
-        return {"acc": mean}
-    def doc_to_target(self, doc):
-        return doc
-class Squad(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "lama"
-    DATASET_NAME = "squad"
-    def has_training_docs(self):
-        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
-        return False
-    def has_validation_docs(self):
-        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
-        return False
-    def has_test_docs(self):
-        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
-        return True
-    def training_docs(self):
-        if self.has_training_docs():
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train"])
-            return self._training_docs
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["validation"]
-    def test_docs(self):
-        if self.has_test_docs():
-            self._test_docs = list(self.dataset["train"])
-            return self._test_docs
-    def process_results(self, doc, results):
-        out = {}
-        pred = results[0].strip()
-        target = self.doc_to_target(doc)['obj_label']
-        #pred = np.argmax(results)
-        out["acc"] = pred == target
-        if self.save_examples:
-            example = {
-                "pred": pred,
-                "target": target,
-            }
-            return out, example
-        return out
-    def higher_is_better(self):
-        return {"acc": True}
-    def aggregation(self):
-        return {"acc": mean}
-    def doc_to_target(self, doc):
-        return doc
-    def max_generation_length(self) -> Optional[int]:
-        """Denote where the max length of the generation if it is obvious from the task."""
-        return 5
--- a/lm_eval/tasks/race.py
+++ b/lm_eval/tasks/race.py
@@ -12,7 +12,7 @@ Homepage: https://www.cs.cmu.edu/~glai1/data/race/
 import collections
 import datasets
 import numpy as np
-from lm_eval.base import PromptSourceTask, rf
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -34,13 +34,13 @@ class each:
        return list(map(self.f, other))
-class RACE(PromptSourceTask):
+class RACE(Task):
    VERSION = 1
    DATASET_PATH = "race"
    DATASET_NAME = "high"
    cache = {}
-    letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
+    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    def has_training_docs(self):
        return True
@@ -51,92 +51,83 @@ class RACE(PromptSourceTask):
    def has_test_docs(self):
        return True
-    # def _collate_data(self, set):
+    def _collate_data(self, set):
-    #     if set in self.cache:
+        if set in self.cache:
-    #         return self.cache[set]
+            return self.cache[set]
-    #     # One big issue with HF's implementation of this dataset: it makes a
+        # One big issue with HF's implementation of this dataset: it makes a
-    #     # separate document for each question; meanwhile, in the GPT3 paper it
+        # separate document for each question; meanwhile, in the GPT3 paper it
-    #     # is shown that one document is made per passage.
+        # is shown that one document is made per passage.
-    #     r = collections.defaultdict(list)
+        r = collections.defaultdict(list)
-    #     for item in datasets.load_dataset(
+        for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
-    #         path=self.DATASET_PATH, name=self.DATASET_NAME
+            r[item['article']].append(item)
-    #     )[set]:
-    #         r[item["article"]].append(item)
+        res = list(r.values() >> each(lambda x: {
+            'article': x[0]['article'],
-    #     res = list(
+            'problems': x >> each(lambda y: {
-    #         r.values()
+                'question': y['question'],
-    #         >> each(
+                'answer': y['answer'],
-    #             lambda x: {
+                'options': y['options'],
-    #                 "article": x[0]["article"],
+            })
-    #                 "problems": x
+        }))
-    #                 >> each(
-    #                     lambda y: {
+        self.cache[set] = res
-    #                         "question": y["question"],
+        return res
-    #                         "answer": y["answer"],
-    #                         "options": y["options"],
-    #                     }
-    #                 ),
-    #             }
-    #         )
-    #     )
-    #     self.cache[set] = res
-    #     return res
    def training_docs(self):
-        return self.dataset["train"]
+        return self._collate_data("train")
    def validation_docs(self):
-        return self.dataset["validation"]
+        return self._collate_data("validation")
    def test_docs(self):
-        return self.dataset["test"]
+        return self._collate_data("test")
    @classmethod
    def get_answer_option(cls, problem):
-        answer = cls.letter_to_num[problem["answer"]]
+        answer = cls.letter_to_num[problem['answer']]
-        return problem["options"][answer]
+        return problem['options'][answer]
    @classmethod
    def last_problem(cls, doc):
-        return doc["problems"][-1]
+        return doc['problems'][-1]
-    # def doc_to_text(self, doc):
+    def doc_to_text(self, doc):
-    #     text = 'Article: ' + doc['article'] + '\n\n'
+        text = 'Article: ' + doc['article'] + '\n\n'
-    #     for problem in doc['problems'][:-1]:
+        for problem in doc['problems'][:-1]:
-    #         if problem['question'][-6:] == '  _  .':
+            if problem['question'][-6:] == '  _  .':
-    #             text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
+                text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
-    #         else:
+            else:
-    #             question = 'Question: ' + problem['question'] + '\n'
+                question = 'Question: ' + problem['question'] + '\n'
-    #             answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
+                answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
-    #             text += question + answer
+                text += question + answer
-    #     text += self.last_problem(doc)['question']
+        text += self.last_problem(doc)['question']
-    #     return text
+        return text
-    # def doc_to_target(self, doc):
+    def doc_to_target(self, doc):
-    #     return " " + self.get_answer_option(self.last_problem(doc))
+        return " " + self.get_answer_option(self.last_problem(doc))
-    # def construct_requests(self, doc, ctx):
+    def construct_requests(self, doc, ctx):
-    #     """Uses RequestFactory to construct Requests and returns an iterable of
+        """ Uses RequestFactory to construct Requests and returns an iterable of 
-    #     Requests which will be sent to the LM.
+        Requests which will be sent to the LM.
-    #     :param doc:
+        :param doc:
-    #         The document as returned from training_docs, validation_docs, or test_docs.
+            The document as returned from training_docs, validation_docs, or test_docs.
-    #     :param ctx: str
+        :param ctx: str
-    #         The context string, generated by fewshot_context. This includes the natural
+            The context string, generated by fewshot_context. This includes the natural 
-    #         language description, as well as the few shot examples, and the question
+            language description, as well as the few shot examples, and the question
-    #         part of the document for `doc`.
+            part of the document for `doc`. 
-    #     """
+        """
-    #     problem = self.last_problem(doc)
+        problem = self.last_problem(doc)
-    #     ll_choices = [
+        ll_choices = [
-    #         rf.loglikelihood(ctx, " " + problem["options"][i])[0] for i in range(4)
+            rf.loglikelihood(ctx, " " + problem['options'][i])[0]
-    #     ]
+            for i in range(4)
-    #     return ll_choices
+        ]
+        return ll_choices
    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
+        """Take a single document and the LM results and evaluates, returning a 
-        dict where keys are the names of submetrics and values are the values of
+        dict where keys are the names of submetrics and values are the values of 
        the metric for that one document
        :param doc:
@@ -144,24 +135,28 @@ class RACE(PromptSourceTask):
        :param results:
            The results of the requests created in construct_requests.
        """
-        #
+        gold = self.letter_to_num[self.last_problem(doc)['answer']]
-        gold = self.letter_to_num[self.doc_to_target(doc)]
-        # gold = self.letter_to_num[self.last_problem(doc)["answer"]]
        pred = np.argmax(results)
-        return {"acc": int(pred == gold)}
+        return {
+            "acc": int(pred == gold)
+        }
    def aggregation(self):
        """
        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
+            A dictionary where keys are the names of submetrics and values are 
            functions that aggregate a list of metrics
        """
-        return {"acc": mean}
+        return {
+            "acc": mean
+        }
    def higher_is_better(self):
        """
        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
+            A dictionary where keys are the names of submetrics and values are 
            whether a higher value of the submetric is better
        """
-        return {"acc": True}
+        return {
+            "acc": True
+        }
--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
--- a/lm_eval/tasks/wino_bias.py
+++ b/lm_eval/tasks/wino_bias.py
-"""
-Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods
-https://arxiv.org/abs/1804.06876
-Winograd-schema evaluation of gendered coreference resolution.
-The dataset contains pro-stereotypical and anti-stereotypical parts. The difference in accuracy for those two subsets
-quatnifies bias.
-Homepage: https://uclanlp.github.io/corefBias/overview
-"""
-from lm_eval.base import PromptSourceTask, mean
-import transformers.data.metrics.squad_metrics as squad_metrics
-_CITATION = """
-@inproceedings{zhao-etal-2018-gender,
-    title = "Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods",
-    author = "Zhao, Jieyu  and
-      Wang, Tianlu  and
-      Yatskar, Mark  and
-      Ordonez, Vicente  and
-      Chang, Kai-Wei",
-    booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)",
-    month = jun,
-    year = "2018",
-    address = "New Orleans, Louisiana",
-    publisher = "Association for Computational Linguistics",
-    url = "https://aclanthology.org/N18-2003",
-    doi = "10.18653/v1/N18-2003",
-    pages = "15--20",
-    abstract = "In this paper, we introduce a new benchmark for co-reference resolution focused on gender bias, WinoBias. Our corpus contains Winograd-schema style sentences with entities corresponding to people referred by their occupation (e.g. the nurse, the doctor, the carpenter). We demonstrate that a rule-based, a feature-rich, and a neural coreference system all link gendered pronouns to pro-stereotypical entities with higher accuracy than anti-stereotypical entities, by an average difference of 21.1 in F1 score. Finally, we demonstrate a data-augmentation approach that, in combination with existing word-embedding debiasing techniques, removes the bias demonstrated by these systems in WinoBias without significantly affecting their performance on existing datasets.",
-}
-"""
-class WinoBias(PromptSourceTask):
-    VERSION = 0
-    DATASET_PATH = "wino_bias"
-    def has_training_docs(self):
-        return False
-    def has_validation_docs(self):
-        return True
-    def has_test_docs(self):
-        return True
-    def training_docs(self):
-        pass
-    def validation_docs(self):
-        return self.dataset["validation"]
-    def test_docs(self):
-        return self.dataset["test"]
-    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
-        dict where keys are the names of submetrics and values are the values of
-        the metric for that one document
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param results:
-            The results of the requests created in construct_requests.
-        """
-        target = self.doc_to_target(doc).strip()
-        pred = " ".join(results[0].strip().split(" ")[:len(target.split(" "))])
-        # The original paper uses F1. In the case of exactly one predicted and one gold mention,
-        # F1 and exact match are equivalent.
-        em = squad_metrics.compute_exact(target, pred)
-        out = {"em": em}
-        if self.save_examples:
-            example = {"target": target, "pred": pred}
-            return out, example
-        return out
-    def aggregation(self):
-        """
-        :returns: {str: [metric_score] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metric scores
-        """
-        return {'em': mean}
-    def higher_is_better(self):
-        return {'em': True}
-class WinoBiasType1Pro(WinoBias):
-    DATASET_NAME = "type1_pro"
-class WinoBiasType1Anti(WinoBias):
-    DATASET_NAME = "type1_anti"
-class WinoBiasType2Pro(WinoBias):
-    DATASET_NAME = "type2_pro"
-class WinoBiasType2Anti(WinoBias):
-    DATASET_NAME = "type2_anti"
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -146,19 +146,6 @@ class Reorderer:
        return res
-def flatten(d, parent_key='', sep='_'):
-    # From: https://stackoverflow.com/a/6027615
-    items = []
-    for k, v in d.items():
-        new_key = parent_key + sep + k if parent_key else k
-        if isinstance(v, collections.MutableMapping):
-            items.extend(flatten(v, new_key, sep=sep).items())
-        else:
-            items.append((new_key, v))
-    return dict(items)
 def positional_deprecated(fn):
    """
    A decorator to nudge users into passing only keyword args (`kwargs`) to the 

--- a/main.py
+++ b/main.py
@@ -9,29 +9,27 @@ logging.getLogger("openai").setLevel(logging.WARNING)
 def parse_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", required=True)
+    parser.add_argument('--model', required=True)
-    parser.add_argument("--model_args", default="")
+    parser.add_argument('--model_args', default="")
-    parser.add_argument("--tasks", default="all_tasks")
+    parser.add_argument('--tasks', default="all_tasks")
-    parser.add_argument("--provide_description", action="store_true")
+    parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument("--num_fewshot", type=int, default=0)
+    parser.add_argument('--num_fewshot', type=int, default=0)
-    parser.add_argument("--batch_size", type=int, default=None)
+    parser.add_argument('--batch_size', type=int, default=None)
-    parser.add_argument("--device", type=str, default=None)
+    parser.add_argument('--device', type=str, default=None)
-    parser.add_argument("--output_path", default=None)
+    parser.add_argument('--output_path', default=None)
-    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument('--limit', type=int, default=None)
-    parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument('--no_cache', action="store_true")
-    parser.add_argument("--description_dict_path", default=None)
+    parser.add_argument('--description_dict_path', default=None)
-    parser.add_argument("--check_integrity", action="store_true")
+    parser.add_argument('--check_integrity', action="store_true")
    return parser.parse_args()
 def main():
    args = parse_args()
    assert not args.provide_description  # not implemented
    if args.limit:
-        print(
+        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
@@ -40,7 +38,7 @@ def main():
    description_dict = {}
    if args.description_dict_path:
-        with open(args.description_dict_path, "r") as f:
+        with open(args.description_dict_path, 'r') as f:
            description_dict = json.load(f)
    results = evaluator.simple_evaluate(
@@ -53,12 +51,11 @@ def main():
        no_cache=args.no_cache,
        limit=args.limit,
        description_dict=description_dict,
-        check_integrity=args.check_integrity,
+        check_integrity=args.check_integrity
    )
-    print(results)
    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:

--- a/scripts/write_out.py
+++ b/scripts/write_out.py
@@ -11,14 +11,14 @@ EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
 def parse_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--output_base_path", required=True)
+    parser.add_argument('--output_base_path', required=True)
-    parser.add_argument("--tasks", default="all_tasks")
+    parser.add_argument('--tasks', default="all_tasks")
-    parser.add_argument("--provide_description", action="store_true")
+    parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument("--sets", type=str, default="val")  # example: val,test
+    parser.add_argument('--sets', type=str, default="val") # example: val,test
-    parser.add_argument("--num_fewshot", type=int, default=1)
+    parser.add_argument('--num_fewshot', type=int, default=1)
-    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument("--num_examples", type=int, default=1)
+    parser.add_argument('--num_examples', type=int, default=1)
-    parser.add_argument("--description_dict_path", default=None)
+    parser.add_argument('--description_dict_path', default=None)
    return parser.parse_args()
@@ -30,11 +30,11 @@ def main():
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
-    task_dict = tasks.get_task_dict_promptsource(task_names)
+    task_dict = tasks.get_task_dict(task_names)
    description_dict = {}
    if args.description_dict_path:
-        with open(args.description_dict_path, "r") as f:
+        with open(args.description_dict_path, 'r') as f:
            description_dict = json.load(f)
    os.makedirs(args.output_base_path, exist_ok=True)
@@ -45,34 +45,26 @@ def main():
        iters = []
        for set in args.sets.split(","):
-            if set == "train" and task.has_training_docs():
+            if set == 'train' and task.has_training_docs():
                docs = task.training_docs()
-            if set == "val" and task.has_validation_docs():
+            if set == 'val' and task.has_validation_docs():
                docs = task.validation_docs()
-            if set == "test" and task.has_test_docs():
+            if set == 'test' and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)
        docs = join_iters(iters)
-        description = (
+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
-            description_dict[task_name]
-            if description_dict and task_name in description_dict
-            else ""
-        )
-        task_name = task_name.replace("/", "_")
        with open(os.path.join(args.output_base_path, task_name), "w") as f:
-            for i, doc in (
+            for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
-                zip(range(args.num_examples), docs)
-                if args.num_examples > 0
-                else enumerate(docs)
-            ):
                f.write(EXAMPLE_DIVIDER.format(i=i))
-                ctx, _ = task.fewshot_context(
+                ctx = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
-                    description=description,
+                    description=description
                )
                f.write(ctx + "\n")

--- a/setup.py
+++ b/setup.py
@@ -18,18 +18,13 @@ setuptools.setup(
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
-    python_requires=">=3.6",
+    python_requires='>=3.6',
    install_requires=[
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon",
-        "wrapt",
-        "nltk",
-        "jinja2",
-        "black",
        "datasets==2.0.0",
        "click>=7.1",
        "scikit-learn>=0.24.1",
        "torch>=1.7",
-        "transformers>=4.16",
+        "transformers>=4.1",
        "sqlitedict==1.6.0",
        "pytablewriter==0.58.0",
        "sacrebleu==1.5.0",
@@ -45,7 +40,7 @@ setuptools.setup(
        "openai==0.6.4",
        "jieba==0.42.1",
        "nagisa==0.2.7",
-        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
+        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
    ],
    dependency_links=[
        "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",

--- a/templates/new_task.py
+++ b/templates/new_task.py
@@ -16,8 +16,7 @@ _CITATION = """
 # TODO: Replace `NewTask` with the name of your Task.
+class NewTask(Task):
-class NewTask(PromptSourceTask):
    VERSION = 0
    # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task`
    # dataset as denoted in HuggingFace `datasets`.
@@ -92,13 +91,6 @@ class NewTask(PromptSourceTask):
        target = ""
        return " " + target
-      def max_generation_length(self):
-        # Define this method when you want to control the length of few-shot
-        # generations on specific tokens. The default is `None` which gets mapped
-        # to a model's default max generation token length. E.g. see `lm_eval/models/gpt2.py:max_gen_toks()`
-        # NOTE: You may delete this function if the task does not required generation.
-        return None
    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
@@ -146,4 +138,4 @@ class NewTask(PromptSourceTask):
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and a `bool` value determining whether or
        # not higher values of that metric are deemed better.
        return {}
\ No newline at end of file
--- a/tests/test_gpt2.py
+++ b/tests/test_gpt2.py
-import random
-import lm_eval.models as models
-import pytest
-import torch
-from transformers import StoppingCriteria
-@pytest.mark.parametrize(
-    "eos_token,test_input,expected", 
-    [
-        ("not", "i like", "i like to say that I'm not"), 
-        ("say that", "i like", "i like to say that"),
-        ("great", "big science is", "big science is a great"),
-        ("<|endoftext|>", "big science has", "big science has been done in the past, but it's not the same as the science of the")
-    ]
-)
-def test_stopping_criteria(eos_token, test_input, expected):
-    random.seed(42)
-    torch.random.manual_seed(42)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    gpt2 = models.get_model("gpt2")(device=device)
-    context = torch.tensor([gpt2.tokenizer.encode(test_input)])
-    stopping_criteria_ids = gpt2.tokenizer.encode(eos_token)
-    generations = gpt2._model_generate(
-        context,
-        max_length=20,
-        stopping_criteria_ids=stopping_criteria_ids
-    )
-    generations = gpt2.tokenizer.decode(generations[0])
-    assert generations == expected