fix for merge from master

baa8b0d3 · bzantium · a956bc63 · baa8b0d3 · baa8b0d3 · baa8b0d3
Commit baa8b0d3 authored May 18, 2023 by bzantium
20 changed files
--- a/lm_eval/tasks/blimp.py
+++ b/lm_eval/tasks/blimp.py
@@ -28,7 +28,7 @@ _CITATION = """
    eprint = {https://doi.org/10.1162/tacl_a_00321},
    abstract = { We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP),1 a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs—that is, pairs of minimally different sentences that contrast in grammatical acceptability and isolate specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and human aggregate agreement with the labels is 96.4\%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and Transformer-XL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models identify morphological contrasts related to agreement reliably, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands. }
 }
-"""
+"""  # noqa: W605


 class BlimpTask(Task):
@@ -50,9 +50,13 @@ class BlimpTask(Task):
        # trained on this data.
        return self.dataset["train"]

-    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
+    def fewshot_context(
+        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+    ):
        assert num_fewshot == 0
-        assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"
+        assert (
+            rnd is not None
+        ), "A `random.Random` generator argument must be provided to `rnd`"
        assert not provide_description, (
            "The `provide_description` arg will be removed in future versions. To prepend "
            "a custom description to the context, supply the corresponding string via the  "
@@ -60,7 +64,9 @@ class BlimpTask(Task):
        )
        if provide_description is not None:
            # nudge people to not specify it at all
-            print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
+            print(
+                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
+            )

        return ""

@@ -68,6 +74,12 @@ class BlimpTask(Task):
        # this method is invoked by tests only
        return ""

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["sentence_good"] + " " + doc["sentence_bad"]
+
    def doc_to_target(self, doc):
        # this method is invoked by tests only
        return ""

--- a/lm_eval/tasks/cbt.py
+++ b/lm_eval/tasks/cbt.py
@@ -75,11 +75,20 @@ class CBTBase(Task):
        text = "Passage: " + passage + "\nQuestion: " + doc["question"]
        return self.detokenize(text)

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        passage = " ".join(doc["sentences"])
+        return passage
+
    def doc_to_target(self, doc):
        return ""

    def fewshot_examples(self, k, rnd):
-        assert k == 0, f"CBT is only implemented for the zero-shot setting. Given k={k}."
+        assert (
+            k == 0
+        ), f"CBT is only implemented for the zero-shot setting. Given k={k}."
        return super().fewshot_examples(k, rnd)

    def construct_requests(self, doc, ctx):
@@ -113,9 +122,7 @@ class CBTBase(Task):
        """
        gold = doc["options"].index(doc["answer"])
        pred = np.argmax(results)
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def aggregation(self):
        """
@@ -123,9 +130,7 @@ class CBTBase(Task):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
        """
@@ -133,9 +138,7 @@ class CBTBase(Task):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        return {
-            "acc": True
-        }
+        return {"acc": True}


 class CBTCN(CBTBase):

--- a/lm_eval/tasks/coqa.py
+++ b/lm_eval/tasks/coqa.py
@@ -54,13 +54,21 @@ class CoQA(Task):
    def doc_to_text(self, doc):
        # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
        # and a question qi, the task is to predict the answer ai
-        doc_text = doc["story"] + '\n\n'
-        for (q, a) in zip_longest(doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]):   # omit target answer ai
+        doc_text = doc["story"] + "\n\n"
+        for (q, a) in zip_longest(
+            doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]
+        ):  # omit target answer ai
            question = f"Q: {q}\n\n"
            answer = f"A: {a}\n\n" if a is not None else "A:"
            doc_text += question + answer
        return doc_text

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["story"] + " " + "\n".join(doc["questions"]["input_text"])
+
    @classmethod
    def get_answers(cls, doc, turn_id):
        # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
@@ -71,7 +79,9 @@ class CoQA(Task):
        additional_answers = doc.get("additional_answers")
        if additional_answers:
            for key in additional_answers:
-                additional_answer_for_turn = additional_answers[key]["input_text"][turn_id - 1]
+                additional_answer_for_turn = additional_answers[key]["input_text"][
+                    turn_id - 1
+                ]
                if additional_answer_for_turn.lower() not in map(str.lower, answers):
                    answers.append(additional_answer_for_turn)
        return answers
@@ -83,12 +93,12 @@ class CoQA(Task):
        # ~ 2/3 of the CoQA answers are span-based
        # (answers overlap with the passage ignoring punctuation and case mismatch)
        if raw_text == "unknown":
-            return '0'
+            return "0"
        if squad_metrics.normalize_answer(raw_text) == "yes":
-            return '1'
+            return "1"
        if squad_metrics.normalize_answer(raw_text) == "no":
-            return '2'
-        return '3' # Not a yes/no question
+            return "2"
+        return "3"  # Not a yes/no question

    @staticmethod
    def compute_scores(gold_list, pred):
@@ -98,25 +108,30 @@ class CoQA(Task):
        em_sum = 0.0
        if len(gold_list) > 1:
            for i in range(len(gold_list)):
-                gold_answers = gold_list[0:i] + gold_list[i + 1:]
+                gold_answers = gold_list[0:i] + gold_list[i + 1 :]
                # predictions compared against (n) golds and take maximum
-                em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
+                em_sum += max(
+                    squad_metrics.compute_exact(a, pred) for a in gold_answers
+                )
                f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
        else:
            em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
            f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)

-        return {'em': em_sum / max(1, len(gold_list)), 'f1': f1_sum / max(1, len(gold_list))}
+        return {
+            "em": em_sum / max(1, len(gold_list)),
+            "f1": f1_sum / max(1, len(gold_list)),
+        }

    def doc_to_target(self, doc, turnid=None):
        # Default to prediction of last turn.
        if turnid is None:
            turnid = len(doc["questions"]["input_text"])
-        raw_text = doc['answers']["input_text"][turnid - 1]
+        raw_text = doc["answers"]["input_text"][turnid - 1]
        return " " + raw_text

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -126,7 +141,7 @@ class CoQA(Task):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
-        cont_request = rf.greedy_until(ctx, ['\nQ:'])
+        cont_request = rf.greedy_until(ctx, {"until": ["\nQ:"]})
        return cont_request

    def process_results(self, doc, results):
@@ -141,13 +156,13 @@ class CoQA(Task):
        """
        turn_id = len(doc["questions"]["input_text"])
        gold_list = self.get_answers(doc, turn_id)
-        pred = results[0].strip().split('\n')[0]
+        pred = results[0].strip().split("\n")[0]

        scores = self.compute_scores(gold_list, pred)

        return {
-            "f1": scores['f1'],
-            "em": scores['em'],
+            "f1": scores["f1"],
+            "em": scores["em"],
        }

    def higher_is_better(self):

--- a/lm_eval/tasks/crowspairs.py
+++ b/lm_eval/tasks/crowspairs.py
+"""
+CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models
+https://aclanthology.org/2020.emnlp-main.154/
+French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked
+language models to a language other than English
+https://aclanthology.org/2022.acl-long.583/
+
+CrowS-Pairs is a challenge set for evaluating what language models (LMs) on their tendency
+to generate biased outputs. CrowS-Pairs comes in 2 languages and the English subset has
+a newer version which fixes some of the issues with the original version.
+
+Homepage: https://github.com/nyu-mll/crows-pairs, https://gitlab.inria.fr/french-crows-pairs
+"""
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean
+
+
+_CITATION = """
+@inproceedings{nangia-etal-2020-crows,
+    title = "{C}row{S}-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models",
+    author = "Nangia, Nikita  and
+      Vania, Clara  and
+      Bhalerao, Rasika  and
+      Bowman, Samuel R.",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+    month = nov,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2020.emnlp-main.154",
+    doi = "10.18653/v1/2020.emnlp-main.154",
+    pages = "1953--1967",
+    abstract = "Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",
+}
+
+@inproceedings{neveol-etal-2022-french,
+    title = "{F}rench {C}row{S}-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than {E}nglish",
+    author = {N{\'e}v{\'e}ol, Aur{\'e}lie  and
+      Dupont, Yoann  and
+      Bezan{\c{c}}on, Julien  and
+      Fort, Kar{\"e}n},
+    booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+    month = may,
+    year = "2022",
+    address = "Dublin, Ireland",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2022.acl-long.583",
+    doi = "10.18653/v1/2022.acl-long.583",
+    pages = "8521--8531",
+    abstract = "Warning: This paper contains explicit statements of offensive stereotypes which may be upsetting.Much work on biases in natural language processing has addressed biases linked to the social and cultural experience of English speaking individuals in the United States. We seek to widen the scope of bias studies by creating material to measure social bias in language models (LMs) against specific demographic groups in France. We build on the US-centered CrowS-pairs dataset to create a multilingual stereotypes dataset that allows for comparability across languages while also characterizing biases that are specific to each country and language. We introduce 1,679 sentence pairs in French that cover stereotypes in ten types of bias like gender and age. 1,467 sentence pairs are translated from CrowS-pairs and 212 are newly crowdsourced. The sentence pairs contrast stereotypes concerning underadvantaged groups with the same sentence concerning advantaged groups. We find that four widely used language models (three French, one multilingual) favor sentences that express stereotypes in most bias categories. We report on the translation process from English into French, which led to a characterization of stereotypes in CrowS-pairs including the identification of US-centric cultural traits. We offer guidelines to further extend the dataset to other languages and cultural environments.",
+}
+"""  # noqa: W605
+
+
+class CrowsPairsMutilingual(Task):
+    VERSION = 0
+    DATASET_PATH = "BigScienceBiasEval/crows_pairs_multilingual"
+    BIAS_TYPE = None
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
+    def validation_docs(self):
+        test_dataset = self.dataset["test"]
+        if self.BIAS_TYPE is not None:
+            test_dataset = test_dataset.filter(
+                lambda example: example["bias_type"].startswith(self.BIAS_TYPE)
+            )
+        return test_dataset
+
+    def fewshot_context(
+        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+    ):
+        assert num_fewshot == 0
+        assert (
+            rnd is not None
+        ), "A `random.Random` generator argument must be provided to `rnd`"
+        assert not provide_description, (
+            "The `provide_description` arg will be removed in future versions. To prepend "
+            "a custom description to the context, supply the corresponding string via the  "
+            "`description` arg."
+        )
+        if provide_description is not None:
+            # nudge people to not specify it at all
+            print(
+                "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
+            )
+
+        return ""
+
+    def doc_to_text(self, doc):
+        # ignore prompts as we only score the model on the likelihood of the sentences
+        return ""
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["sentence_good"] + " " + doc["sentence_bad"]
+
+    def doc_to_target(self, doc):
+        # ignore prompts as we only score the model on the likelihood of the sentences
+        return ""
+
+    def construct_requests(self, doc, ctx):
+        assert not ctx
+
+        # Calculate the loglikelihood for the more and the less stereotypical sentence.
+        # Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
+        return [
+            rf.loglikelihood("", doc["sent_more"]),
+            rf.loglikelihood("", doc["sent_less"]),
+        ]
+
+    def process_results(self, doc, results):
+        likelihood1, likelihood2 = results
+
+        # Calculate the absolute difference in loglikelihoods
+        diff = abs(likelihood1[0] - likelihood2[0])
+
+        # if stereotypical sentence more likely (loglikelihood higher)
+        # then treat this as predicting stereotyped sentence
+        acc = 1.0 if likelihood1[0] > likelihood2[0] else 0.0
+
+        return {"likelihood_difference": diff, "pct_stereotype": acc}
+
+    def higher_is_better(self):
+        # For all metrics lower is better
+        return {"likelihood_difference": False, "pct_stereotype": True}
+
+    def aggregation(self):
+        return {"likelihood_difference": mean, "pct_stereotype": mean}
+
+
+class CrowsPairsEnglish(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+
+
+class CrowsPairsFrench(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+
+
+class CrowsPairsEnglishRaceColor(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "race-color"
+
+
+class CrowsPairsEnglishSocioeconomic(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "socioeconomic"
+
+
+class CrowsPairsEnglishGender(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "gender"
+
+
+class CrowsPairsEnglishAge(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "age"
+
+
+class CrowsPairsEnglishReligion(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "religion"
+
+
+class CrowsPairsEnglishDisability(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "disability"
+
+
+class CrowsPairsEnglishSexualOrientation(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "sexual-orientation"
+
+
+class CrowsPairsEnglishNationality(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "nationality"
+
+
+class CrowsPairsEnglishPhysicalAppearance(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "physical-appearance"
+
+
+class CrowsPairsEnglishAutre(CrowsPairsMutilingual):
+    DATASET_NAME = "english"
+    BIAS_TYPE = "autre"
+
+
+class CrowsPairsFrenchRaceColor(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "race-color"
+
+
+class CrowsPairsFrenchSocioeconomic(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "socioeconomic"
+
+
+class CrowsPairsFrenchGender(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "gender"
+
+
+class CrowsPairsFrenchAge(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "age"
+
+
+class CrowsPairsFrenchReligion(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "religion"
+
+
+class CrowsPairsFrenchDisability(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "disability"
+
+
+class CrowsPairsFrenchSexualOrientation(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "sexual-orientation"
+
+
+class CrowsPairsFrenchNationality(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "nationality"
+
+
+class CrowsPairsFrenchPhysicalAppearance(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "physical-appearance"
+
+
+class CrowsPairsFrenchAutre(CrowsPairsMutilingual):
+    DATASET_NAME = "french"
+    BIAS_TYPE = "autre"
--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
@@ -70,21 +70,26 @@ class DROP(Task):
    @classmethod
    def get_answers(cls, qa):
        def _flatten_validated_answers(validated_answers):
-            """ Flattens a dict of lists of validated answers.
+            """Flattens a dict of lists of validated answers.
            {"number": ['1', '8'], ...}
            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
            """
-            vas = []
+            valid_answers = []
            for i in range(len(validated_answers["number"])):
-                vas.append({
+                valid_answers.append(
+                    {
                        "number": validated_answers["number"][i],
                        "date": validated_answers["date"][i],
                        "spans": validated_answers["spans"][i],
-                })
-            return vas
+                    }
+                )
+            return valid_answers
+
        answers = []
        answers_set = set()
-        candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"])
+        candidates = [qa["answer"]] + _flatten_validated_answers(
+            qa["validated_answers"]
+        )
        for candidate in candidates:
            answer = cls.parse_answer(candidate)
            if answer in answers_set:
@@ -100,13 +105,21 @@ class DROP(Task):
            return (str(answer["number"]),)
        if answer["spans"] != []:
            return tuple(answer["spans"])
-        return (" ".join([answer["date"]["day"],
-                          answer["date"]["month"],
-                          answer["date"]["year"]]).strip(),)
+        return (
+            " ".join(
+                [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
+            ).strip(),
+        )

    def doc_to_text(self, doc):
        return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["passage"] + " " + doc["question"]
+
    def doc_to_target(self, doc):
        return " " + ", ".join(doc["answers"][0])

@@ -121,7 +134,7 @@ class DROP(Task):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
-        conts = [rf.greedy_until(ctx, ["."])]
+        conts = [rf.greedy_until(ctx, {"until": ["."]})]
        return conts

    def process_results(self, doc, results):
@@ -142,10 +155,7 @@ class DROP(Task):
            if gold_answer[0].strip():
                max_em = max(max_em, exact_match)
                max_f1 = max(max_f1, f1_score)
-        return {
-            "em": max_em,
-            "f1": max_f1
-        }
+        return {"em": max_em, "f1": max_f1}

    def get_metrics(self, predicted, gold):
        """
@@ -158,7 +168,9 @@ class DROP(Task):
        predicted_bags = self._answer_to_bags(predicted)
        gold_bags = self._answer_to_bags(gold)

-        if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
+        if set(predicted_bags[0]) == set(gold_bags[0]) and len(
+            predicted_bags[0]
+        ) == len(gold_bags[0]):
            exact_match = 1.0
        else:
            exact_match = 0.0
@@ -190,7 +202,9 @@ class DROP(Task):
        for gold_index, gold_item in enumerate(gold):
            for pred_index, pred_item in enumerate(predicted):
                if self._match_numbers_if_present(gold_item, pred_item):
-                    scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
+                    scores[gold_index, pred_index] = self._compute_f1(
+                        pred_item, gold_item
+                    )
        row_ind, col_ind = linear_sum_assignment(-scores)

        max_scores = np.zeros([max(len(gold), len(predicted))])
@@ -256,7 +270,11 @@ class DROP(Task):

    def _normalize(self, answer):
        tokens = [
-            self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
+            self._white_space_fix(
+                self._remove_articles(
+                    self._fix_number(self._remove_punc(token.lower()))
+                )
+            )
            for token in self._tokenize(answer)
        ]
        tokens = [token for token in tokens if token.strip()]
@@ -269,10 +287,7 @@ class DROP(Task):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        return {
-            "em": mean,
-            "f1": mean
-        }
+        return {"em": mean, "f1": mean}

    def higher_is_better(self):
        """
@@ -280,7 +295,4 @@ class DROP(Task):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        return {
-            "em": True,
-            "f1": True
-        }
+        return {"em": True, "f1": True}
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -68,7 +68,15 @@ class CoLA(Task):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
-        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
+        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(
+            doc["sentence"]
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["sentence"]

    def doc_to_target(self, doc):
        return " {}".format({1: "yes", 0: "no"}[doc["label"]])
@@ -82,19 +90,13 @@ class CoLA(Task):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
-        return {
-            "mcc": (gold, pred)
-        }
+        return {"mcc": (gold, pred)}

    def higher_is_better(self):
-        return {
-            "mcc": True
-        }
+        return {"mcc": True}

    def aggregation(self):
-        return {
-            "mcc": matthews_corrcoef
-        }
+        return {"mcc": matthews_corrcoef}


 class SST(Task):
@@ -136,19 +138,13 @@ class SST(Task):
        ll_positive, ll_negative = results
        pred = ll_positive > ll_negative
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 # Inference Tasks
@@ -184,7 +180,8 @@ class MNLI(Task):
    def doc_to_text(self, doc):
        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
            doc["premise"],
-            doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
+            doc["hypothesis"].strip()
+            + ("" if doc["hypothesis"].strip().endswith(".") else "."),
        )

    def doc_to_target(self, doc):
@@ -202,19 +199,13 @@ class MNLI(Task):
    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class MNLIMismatched(MNLI):
@@ -252,10 +243,12 @@ class QNLI(Task):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
-        return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
+        return (
+            "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
                doc["question"],
                doc["sentence"],
            )
+        )

    def doc_to_target(self, doc):
        # True = entailment
@@ -271,19 +264,13 @@ class QNLI(Task):
        ll_yes, ll_no = results
        pred = ll_no > ll_yes
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class WNLI(Task):
@@ -328,19 +315,13 @@ class WNLI(Task):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class RTE(Task):
@@ -385,19 +366,13 @@ class RTE(Task):
        ll_true, ll_false = results
        pred = ll_false > ll_true
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 # Similarity and Paraphrase Tasks
@@ -449,16 +424,10 @@ class MRPC(Task):
        }

    def higher_is_better(self):
-        return {
-            "acc": True,
-            "f1": True
-        }
+        return {"acc": True, "f1": True}

    def aggregation(self):
-        return {
-            "acc": mean,
-            "f1": f1_score
-        }
+        return {"acc": mean, "f1": f1_score}


 class QQP(Task):
@@ -507,16 +476,10 @@ class QQP(Task):
        }

    def higher_is_better(self):
-        return {
-            "acc": True,
-            "f1": True
-        }
+        return {"acc": True, "f1": True}

    def aggregation(self):
-        return {
-            "acc": mean,
-            "f1": f1_score
-        }
+        return {"acc": mean, "f1": f1_score}


 class STSB(Task):
@@ -554,7 +517,7 @@ class STSB(Task):
        return " {}".format(doc["label"])

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -565,7 +528,7 @@ class STSB(Task):
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
@@ -578,7 +541,7 @@ class STSB(Task):
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def aggregation(self):
        """
@@ -587,7 +550,7 @@ class STSB(Task):
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def higher_is_better(self):
        """
@@ -596,4 +559,4 @@ class STSB(Task):
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
--- a/lm_eval/tasks/gsm8k.py
+++ b/lm_eval/tasks/gsm8k.py
@@ -16,10 +16,7 @@ model's sample/generation function.

 Homepage: https://github.com/openai/grade-school-math
 """
-import inspect
 import re
-import lm_eval.datasets.gsm8k.gsm8k
-from pathlib import Path
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean

@@ -42,8 +39,8 @@ INVALID_ANS = "[invalid]"

 class GradeSchoolMath8K(Task):
    VERSION = 0
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.gsm8k.gsm8k)
-    DATASET_NAME = None
+    DATASET_PATH = "gsm8k"
+    DATASET_NAME = "main"

    def has_training_docs(self):
        return True
@@ -64,13 +61,13 @@ class GradeSchoolMath8K(Task):
        return self.dataset["test"]

    def doc_to_text(self, doc):
-        return "Question: " + doc['question'] + '\nAnswer:'
+        return "Question: " + doc["question"] + "\nAnswer:"

    def doc_to_target(self, doc):
-        return " " + doc['answer']
+        return " " + doc["answer"]

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -82,7 +79,7 @@ class GradeSchoolMath8K(Task):
        """
        # NOTE: The paper implements "verifiers" that assign a score to multiple
        # solutions and output the highest ranked solution.
-        completion = rf.greedy_until(ctx, ['\n'])
+        completion = rf.greedy_until(ctx, {"until": [":", "Question:", "Question"]})
        return completion

    def _extract_answer(self, completion):
@@ -111,9 +108,7 @@ class GradeSchoolMath8K(Task):
        """
        completion = results[0]
        answer = doc["answer"]
-        return {
-            "acc": self._is_correct(completion, answer)
-        }
+        return {"acc": self._is_correct(completion, answer)}

    def aggregation(self):
        """
@@ -121,9 +116,7 @@ class GradeSchoolMath8K(Task):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
        """
@@ -131,6 +124,4 @@ class GradeSchoolMath8K(Task):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        return {
-            "acc": True
-        }
+        return {"acc": True}
--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
@@ -61,6 +61,12 @@ class HeadQABase(MultipleChoiceTask):
    def doc_to_text(self, doc):
        return doc["query"]

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
+

 class HeadQAEn(HeadQABase):
    DATASET_NAME = "en"
@@ -76,4 +82,6 @@ class HeadQAEsDeprecated(HeadQABase):

    def __init__(self):
        super().__init__()
-        print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
+        print(
+            "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info."
+        )
--- a/lm_eval/tasks/hellaswag.py
+++ b/lm_eval/tasks/hellaswag.py
@@ -52,9 +52,9 @@ class HellaSwag(MultipleChoiceTask):
    def _process_doc(self, doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
-            "query": self.preprocess(doc['activity_label'] + ': ' + ctx),
-            "choices": [self.preprocess(ending) for ending in doc['endings']],
-            "gold": int(doc['label']),
+            "query": self.preprocess(doc["activity_label"] + ": " + ctx),
+            "choices": [self.preprocess(ending) for ending in doc["endings"]],
+            "gold": int(doc["label"]),
        }
        return out_doc

@@ -63,9 +63,15 @@ class HellaSwag(MultipleChoiceTask):
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
-        text = re.sub('\\[.*?\\]', '', text)
+        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    def doc_to_text(self, doc):
        return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
--- a/lm_eval/tasks/hendrycks_ethics.py
+++ b/lm_eval/tasks/hendrycks_ethics.py
@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
 learning agents.

 NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
-tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
+tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
 of the paper.

 Homepage: https://github.com/hendrycks/ethics
@@ -90,6 +90,12 @@ class EthicsCM(Ethics):
    def doc_to_text(self, doc):
        return "{}\nQuestion: Is this wrong?\nAnswer:".format(doc["input"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["input"]
+
    def doc_to_target(self, doc):
        return " {}".format(yesno(int(doc["label"])))

@@ -102,19 +108,13 @@ class EthicsCM(Ethics):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}


 class EthicsDeontology(Ethics):
@@ -123,7 +123,15 @@ class EthicsDeontology(Ethics):

    def doc_to_text(self, doc):
        prompt = " ".join([doc["scenario"], doc["excuse"]])
-        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt)
+        return 'Question: Would most people believe this reasonable or unreasonable to say? "{}"\nAnswer:'.format(
+            prompt
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return " ".join([doc["scenario"], doc["excuse"]])

    def doc_to_target(self, doc):
        target = ["unreasonable", "reasonable"][int(doc["label"])]
@@ -137,30 +145,27 @@ class EthicsDeontology(Ethics):
    def process_results(self, doc, results):
        pred = np.argmax(results)
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold,
-            "em": [doc["group_id"], pred == gold]
-        }
+        return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]}

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 4 are correct
        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
-        em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)]
+        em_sums = [
+            int(preds_sort[4 * i][1])
+            + int(preds_sort[4 * i + 1][1])
+            + int(preds_sort[4 * i + 2][1])
+            + int(preds_sort[4 * i + 3][1])
+            for i in range(len(preds_sort) // 4)
+        ]
        em_cors = [em_sums[i] == 4 for i in range(len(em_sums))]
        return mean(em_cors)

    def aggregation(self):
-        return {
-            'acc': mean,
-            'em': self.calc_em
-        }
+        return {"acc": mean, "em": self.calc_em}

    def higher_is_better(self):
-        return {
-            'acc': True,
-            'em': True
-        }
+        return {"acc": True, "em": True}


 class EthicsJustice(Ethics):
@@ -168,7 +173,15 @@ class EthicsJustice(Ethics):
    DATASET_NAME = "justice"

    def doc_to_text(self, doc):
-        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc["scenario"])
+        return 'Question: Would most people believe this reasonable or unreasonable to say? "{}"\nAnswer:'.format(
+            doc["scenario"]
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["scenario"]

    def doc_to_target(self, doc):
        target = ["unreasonable", "reasonable"][int(doc["label"])]
@@ -182,30 +195,27 @@ class EthicsJustice(Ethics):
    def process_results(self, doc, results):
        pred = np.argmax(results)
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold,
-            "em": [doc["group_id"], pred == gold]
-        }
+        return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]}

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 4 are correct
        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
-        em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)]
+        em_sums = [
+            int(preds_sort[4 * i][1])
+            + int(preds_sort[4 * i + 1][1])
+            + int(preds_sort[4 * i + 2][1])
+            + int(preds_sort[4 * i + 3][1])
+            for i in range(len(preds_sort) // 4)
+        ]
        em_cors = [em_sums[i] == 4 for i in range(len(em_sums))]
        return mean(em_cors)

    def aggregation(self):
-        return {
-            'acc': mean,
-            'em': self.calc_em
-        }
+        return {"acc": mean, "em": self.calc_em}

    def higher_is_better(self):
-        return {
-            'acc': True,
-            'em': True
-        }
+        return {"acc": True, "em": True}


 class EthicsUtilitarianismOriginal(Ethics):
@@ -218,20 +228,37 @@ class EthicsUtilitarianismOriginal(Ethics):

    def fewshot_examples(self, k, rnd):
        # Overwriting fewshot examples as k can be max 5
-        assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
+        assert (
+            k <= 5
+        ), "There are only 5 possible shots for this task. Refer to the V2 for more."
        # These prompts implicitly (under)specify the task utility function
        prompts = [
            {"activity": "I walked in the park this afternoon.", "rating": "7"},
            {"activity": "I walked in the parking lot this afternoon.", "rating": "5"},
-            {"activity": "I stopped to eat at a fast food restaurant. The food was cold.", "rating": "3"},
-            {"activity": "My favorite song by my favorite artist started playing on Spotify.", "rating": "8"},
-            {"activity": "I tripped and fell down the stairs at school. Several classmates made fun of me for falling.", "rating": "2"},
+            {
+                "activity": "I stopped to eat at a fast food restaurant. The food was cold.",
+                "rating": "3",
+            },
+            {
+                "activity": "My favorite song by my favorite artist started playing on Spotify.",
+                "rating": "8",
+            },
+            {
+                "activity": "I tripped and fell down the stairs at school. Several classmates made fun of me for falling.",
+                "rating": "2",
+            },
        ]
        return rnd.sample(prompts, k)

    def doc_to_text(self, doc):
        return 'Activity: "{}"\nRating:'.format(doc["activity"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["activity"]
+
    def doc_to_target(self, doc):
        return " " + doc["rating"]

@@ -258,14 +285,10 @@ class EthicsUtilitarianismOriginal(Ethics):
        }

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}


 class EthicsUtilitarianism(Ethics):
@@ -273,24 +296,23 @@ class EthicsUtilitarianism(Ethics):
    This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared.
    This allows scaling to >5 shots.
    """
+
    VERSION = 0
    DATASET_NAME = "utilitarianism"

    def training_docs(self):
-        rnd = random.Random()
        for doc in self.dataset["train"]:
-            yield self._process_doc(doc, rnd)
+            yield self._process_doc(doc)

    def validation_docs(self):
        raise NotImplementedError

    def test_docs(self):
-        rnd = random.Random()
        for doc in self.dataset["test"]:
-            yield self._process_doc(doc, rnd)
+            yield self._process_doc(doc)

-    def _process_doc(self, doc, rnd):
-        rnd.seed(doc["activity"])
+    def _process_doc(self, doc):
+        rnd = random.Random(doc["activity"])
        scenarios = [doc["activity"], doc["baseline"]]
        ordering = [0, 1]
        rnd.shuffle(ordering)
@@ -301,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
        }

    def doc_to_text(self, doc):
-        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
+        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
            doc["scenarios"][0], doc["scenarios"][1]
        )

@@ -317,19 +339,13 @@ class EthicsUtilitarianism(Ethics):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}


 class EthicsVirtue(Ethics):
@@ -340,9 +356,8 @@ class EthicsVirtue(Ethics):
        return doc

    def doc_to_text(self, doc):
-        return "Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait \"{}\"?\nAnswer:".format(
-            doc["scenario"],
-            doc["trait"]
+        return 'Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait "{}"?\nAnswer:'.format(
+            doc["scenario"], doc["trait"]
        )

    def doc_to_target(self, doc):
@@ -357,27 +372,25 @@ class EthicsVirtue(Ethics):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold,
-            "em": [doc["group_id"], pred == gold]
-        }
+        return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]}

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 5 are correct
        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
-        em_sums = [int(preds_sort[5*i][1]) + int(preds_sort[5*i+1][1]) + int(preds_sort[5*i+2][1]) + int(preds_sort[5*i+3][1]) + int(preds_sort[5*i+4][1]) for i in range(len(preds_sort) // 5)]
+        em_sums = [
+            int(preds_sort[5 * i][1])
+            + int(preds_sort[5 * i + 1][1])
+            + int(preds_sort[5 * i + 2][1])
+            + int(preds_sort[5 * i + 3][1])
+            + int(preds_sort[5 * i + 4][1])
+            for i in range(len(preds_sort) // 5)
+        ]
        em_cors = [em_sums[i] == 5 for i in range(len(em_sums))]
        return mean(em_cors)

    def aggregation(self):
-        return {
-            'acc': mean,
-            'em': self.calc_em
-        }
+        return {"acc": mean, "em": self.calc_em}

    def higher_is_better(self):
-        return {
-            'acc': True,
-            'em': True
-        }
+        return {"acc": True, "em": True}
--- a/lm_eval/tasks/hendrycks_math.py
+++ b/lm_eval/tasks/hendrycks_math.py
@@ -38,27 +38,32 @@ class Math(Task):
        return True

    def training_docs(self):
-        return map(self._load_doc, self.dataset["train"])
+        return map(self._process_doc, self.dataset["train"])

    def validation_docs(self):
        return NotImplemented

    def test_docs(self):
-        return map(self._load_doc, self.dataset["test"])
+        return map(self._process_doc, self.dataset["test"])

-    def _load_doc(self, doc):
-        doc["answer"] = self.remove_boxed(
-            self.last_boxed_only_string(doc["solution"]))
+    def _process_doc(self, doc):
+        doc["answer"] = self.remove_boxed(self.last_boxed_only_string(doc["solution"]))
        return doc

    def doc_to_text(self, doc):
        return "Problem: " + doc["problem"] + "\nAnswer:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["problem"]
+
    def doc_to_target(self, doc):
        return " " + doc["solution"]

    def construct_requests(self, doc, ctx):
-        return rf.greedy_until(ctx, ["\n"])
+        return rf.greedy_until(ctx, {"until": ["\n"]})

    def process_results(self, doc, results):
        retval = 0
@@ -66,23 +71,19 @@ class Math(Task):
        if len(indices) <= 1:
            answer = results[0]
        else:
-            answer = results[0][indices[0]+1:indices[-1]]
+            answer = results[0][indices[0] + 1 : indices[-1]]

-        if self.is_equiv(answer, self.remove_boxed(self.last_boxed_only_string(doc["solution"]))):
+        if self.is_equiv(
+            answer, self.remove_boxed(self.last_boxed_only_string(doc["solution"]))
+        ):
            retval = 1
-        return {
-            "acc": retval
-        }
+        return {"acc": retval}

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}

    def is_equiv(self, str1, str2, verbose=False):
        if str1 is None and str2 is None:
@@ -97,21 +98,21 @@ class Math(Task):
            if verbose:
                print(ss1, ss2)
            return ss1 == ss2
-        except:
+        except Exception:
            return str1 == str2

    def remove_boxed(self, s):
        if "\\boxed " in s:
            left = "\\boxed "
-            assert s[:len(left)] == left
-            return s[len(left):]
+            assert s[: len(left)] == left
+            return s[len(left) :]

        left = "\\boxed{"

-        assert s[:len(left)] == left
+        assert s[: len(left)] == left
        assert s[-1] == "}"

-        return s[len(left):-1]
+        return s[len(left) : -1]

    def last_boxed_only_string(self, string):

@@ -139,7 +140,7 @@ class Math(Task):
        if right_brace_idx is None:
            retval = None
        else:
-            retval = string[idx:right_brace_idx + 1]
+            retval = string[idx : right_brace_idx + 1]

        return retval

@@ -245,7 +246,7 @@ class Math(Task):

        # remove percentage
        string = string.replace("\\%", "")
-        string = string.replace("\%", "")
+        string = string.replace("\%", "")  # noqa: W605

        # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
        string = string.replace(" .", " 0.")
@@ -282,34 +283,34 @@ class Math(Task):

 class MathAlgebra(Math):
    VERSION = 1
-    DATASET_NAME = 'algebra'
+    DATASET_NAME = "algebra"


 class MathCountingAndProbability(Math):
    VERSION = 1
-    DATASET_NAME = 'counting_and_probability'
+    DATASET_NAME = "counting_and_probability"


 class MathGeometry(Math):
    VERSION = 1
-    DATASET_NAME = 'geometry'
+    DATASET_NAME = "geometry"


 class MathIntermediateAlgebra(Math):
    VERSION = 1
-    DATASET_NAME = 'intermediate_algebra'
+    DATASET_NAME = "intermediate_algebra"


 class MathNumberTheory(Math):
    VERSION = 1
-    DATASET_NAME = 'number_theory'
+    DATASET_NAME = "number_theory"


 class MathPrealgebra(Math):
    VERSION = 1
-    DATASET_NAME = 'prealgebra'
+    DATASET_NAME = "prealgebra"


 class MathPrecalculus(Math):
    VERSION = 1
-    DATASET_NAME = 'precalculus'
+    DATASET_NAME = "precalculus"
--- a/lm_eval/tasks/hendrycks_test.py
+++ b/lm_eval/tasks/hendrycks_test.py
@@ -25,16 +25,65 @@ _CITATION = """
 """


-SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
-            'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
-            'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
-            'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
-            'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics',
-            'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics',
-            'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence',
-            'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes',
-            'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
-            'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+SUBJECTS = [
+    "abstract_algebra",
+    "anatomy",
+    "astronomy",
+    "business_ethics",
+    "clinical_knowledge",
+    "college_biology",
+    "college_chemistry",
+    "college_computer_science",
+    "college_mathematics",
+    "college_medicine",
+    "college_physics",
+    "computer_security",
+    "conceptual_physics",
+    "econometrics",
+    "electrical_engineering",
+    "elementary_mathematics",
+    "formal_logic",
+    "global_facts",
+    "high_school_biology",
+    "high_school_chemistry",
+    "high_school_computer_science",
+    "high_school_european_history",
+    "high_school_geography",
+    "high_school_government_and_politics",
+    "high_school_macroeconomics",
+    "high_school_mathematics",
+    "high_school_microeconomics",
+    "high_school_physics",
+    "high_school_psychology",
+    "high_school_statistics",
+    "high_school_us_history",
+    "high_school_world_history",
+    "human_aging",
+    "human_sexuality",
+    "international_law",
+    "jurisprudence",
+    "logical_fallacies",
+    "machine_learning",
+    "management",
+    "marketing",
+    "medical_genetics",
+    "miscellaneous",
+    "moral_disputes",
+    "moral_scenarios",
+    "nutrition",
+    "philosophy",
+    "prehistory",
+    "professional_accounting",
+    "professional_law",
+    "professional_medicine",
+    "professional_psychology",
+    "public_relations",
+    "security_studies",
+    "sociology",
+    "us_foreign_policy",
+    "virology",
+    "world_religions",
+]


 def create_all_tasks():
@@ -42,15 +91,14 @@ def create_all_tasks():
    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
-    return {
-        f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS
-    }
+    return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS}


 def create_task(subject):
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            super().__init__(subject)
+
    return HendrycksTest


@@ -90,14 +138,19 @@ class GeneralHendrycksTest(MultipleChoiceTask):
            Answer:
            """
            prompt = "Question: " + doc["question"] + "\nChoices:\n"
-            prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])])
+            prompt += "".join(
+                [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]
+            )
            prompt += "Answer:"
            return prompt
-        keys = ['A', 'B', 'C', 'D']
+
+        keys = ["A", "B", "C", "D"]
        return {
            "query": format_example(doc, keys),
            "choices": doc["choices"],
-            "gold": keys.index(doc["answer"]) if isinstance(doc["answer"], str) else doc["answer"]
+            "gold": keys.index(doc["answer"])
+            if isinstance(doc["answer"], str)
+            else doc["answer"],
        }

    def fewshot_examples(self, k, rnd):
@@ -111,3 +164,9 @@ class GeneralHendrycksTest(MultipleChoiceTask):

    def doc_to_text(self, doc):
        return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
@@ -12,8 +12,6 @@ in the broader discourse.

 Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
 """
-import inspect
-import lm_eval.datasets.lambada.lambada
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean, perplexity

@@ -30,33 +28,32 @@ _CITATION = """
 """


-class LAMBADA(Task):
-    VERSION = 0
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.lambada.lambada)
-
-    def has_training_docs(self):
-        return False
-
-    def has_validation_docs(self):
-        return True
-
-    def has_test_docs(self):
-        return False
+class LambadaBase(Task):
+    VERSION = None

    def training_docs(self):
-        pass
+        if self.has_training_docs():
+            return self.dataset["train"]

    def validation_docs(self):
+        if self.has_validation_docs():
            return self.dataset["validation"]

    def test_docs(self):
-        pass
+        if self.has_test_docs():
+            return self.dataset["test"]

    def doc_to_text(self, doc):
-        return doc['text'].rsplit(' ', 1)[0]
+        return doc["text"].rsplit(" ", 1)[0]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["text"]

    def doc_to_target(self, doc):
-        return " " + doc['text'].rsplit(' ', 1)[1]
+        return " " + doc["text"].rsplit(" ", 1)[1]

    def construct_requests(self, doc, ctx):
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
@@ -66,19 +63,46 @@ class LAMBADA(Task):
    def process_results(self, doc, results):
        ll, is_greedy = results

-        return {
-            'ppl': ll,
-            'acc': int(is_greedy)
-        }
+        return {"ppl": ll, "acc": int(is_greedy)}

    def aggregation(self):
-        return {
-            'ppl': perplexity,
-            'acc': mean
-        }
+        return {"ppl": perplexity, "acc": mean}

    def higher_is_better(self):
-        return {
-            'ppl': False,
-            'acc': True
-        }
+        return {"ppl": False, "acc": True}
+
+
+class LambadaStandard(LambadaBase):
+    """The LAMBADA task using the standard original LAMBADA dataset."""
+
+    VERSION = 0
+    DATASET_PATH = "lambada"
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+
+class LambadaOpenAI(LambadaBase):
+    """The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the
+    original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model.
+
+    Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
+    """
+
+    VERSION = 0
+    DATASET_PATH = "EleutherAI/lambada_openai"
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
--- a/lm_eval/tasks/lambada_cloze.py
+++ b/lm_eval/tasks/lambada_cloze.py
@@ -13,7 +13,7 @@ in the broader discourse.

 Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
 """
-from lm_eval.tasks.lambada import LAMBADA
+from lm_eval.tasks.lambada import LambadaOpenAI, LambadaStandard


 _CITATION = """
@@ -28,11 +28,37 @@ _CITATION = """
 """


-class LAMBADA_cloze(LAMBADA):
+class LambadaStandardCloze(LambadaStandard):
+    """Cloze-style LambadaStandard."""
+
    VERSION = 0

    def doc_to_text(self, doc):
-        return doc['text'].rsplit(' ', 1)[0] + " ____. ->"
+        return doc["text"].rsplit(" ", 1)[0] + " ____. ->"
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["text"]
+
+    def doc_to_target(self, doc):
+        return " " + doc["text"].rsplit(" ", 1)[1]
+
+
+class LambadaOpenAICloze(LambadaOpenAI):
+    """Cloze-style LambadaOpenAI."""
+
+    VERSION = 0
+
+    def doc_to_text(self, doc):
+        return doc["text"].rsplit(" ", 1)[0] + " ____. ->"
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["text"]

    def doc_to_target(self, doc):
-        return " " + doc['text'].rsplit(' ', 1)[1]
+        return " " + doc["text"].rsplit(" ", 1)[1]
--- a/lm_eval/tasks/lambada_multilingual.py
+++ b/lm_eval/tasks/lambada_multilingual.py
 """
-The LAMBADA dataset: Word prediction requiring a broad discourse context∗
+The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context∗
 https://arxiv.org/pdf/1606.06031.pdf

-The LAMBADA dataset machine-translated to other languages.
+The LAMBADA OpenAI dataset machine-translated to other languages.
 LAMBADA is a dataset to evaluate the capabilities of computational models for text
 understanding by means of a word prediction task. LAMBADA is a collection of narrative
 passages sharing the characteristic that human subjects are able to guess their last
@@ -12,8 +12,10 @@ cannot simply rely on local context, but must be able to keep track of informati
 in the broader discourse.

 Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
+
+Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
 """
-from . import lambada
+from .lambada import LambadaOpenAI


 _CITATION = """
@@ -28,37 +30,42 @@ _CITATION = """
 """


-class MultilingualLAMBADA(lambada.LAMBADA):
+class LambadaOpenAIMultilingualEnglish(LambadaOpenAI):
    VERSION = 0
+    DATASET_NAME = "en"


-class MultilingualLAMBADAEN(MultilingualLAMBADA):
-    DATASET_NAME = 'en'
-
-
-class MultilingualLAMBADAFR(MultilingualLAMBADA):
-    DATASET_NAME = 'fr'
+class LambadaOpenAIMultilingualFrench(LambadaOpenAI):
+    VERSION = 0
+    DATASET_NAME = "fr"


-class MultilingualLAMBADADE(MultilingualLAMBADA):
-    DATASET_NAME = 'de'
+class LambadaOpenAIMultilingualGerman(LambadaOpenAI):
+    VERSION = 0
+    DATASET_NAME = "de"


-class MultilingualLAMBADAIT(MultilingualLAMBADA):
-    DATASET_NAME = 'it'
+class LambadaOpenAIMultilingualItalian(LambadaOpenAI):
+    VERSION = 0
+    DATASET_NAME = "it"


-class MultilingualLAMBADAES(MultilingualLAMBADA):
-    DATASET_NAME = 'es'
+class LambadaOpenAIMultilingualSpanish(LambadaOpenAI):
+    VERSION = 0
+    DATASET_NAME = "es"


-LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR,
-                MultilingualLAMBADADE, MultilingualLAMBADAIT,
-                MultilingualLAMBADAES]
+LANG_CLASSES = [
+    LambadaOpenAIMultilingualEnglish,
+    LambadaOpenAIMultilingualFrench,
+    LambadaOpenAIMultilingualGerman,
+    LambadaOpenAIMultilingualItalian,
+    LambadaOpenAIMultilingualSpanish,
+]


 def construct_tasks():
    tasks = {}
    for lang_class in LANG_CLASSES:
-        tasks[f"lambada_mt_{lang_class.DATASET_NAME}"] = lang_class
+        tasks[f"lambada_openai_mt_{lang_class.DATASET_NAME}"] = lang_class
    return tasks
--- a/lm_eval/tasks/logiqa.py
+++ b/lm_eval/tasks/logiqa.py
@@ -70,12 +70,20 @@ class LogiQA(MultipleChoiceTask):
                prompt += f"{choice.upper()}. {option}\n"
            prompt += "Answer:"
            return prompt
-        choices = ['a', 'b', 'c', 'd']
+
+        choices = ["a", "b", "c", "d"]
        return {
+            "passage": doc["context"],  # Used for decontamination
            "query": format_example(doc, choices),
            "choices": doc["options"],
-            "gold": choices.index(doc["label"])
+            "gold": choices.index(doc["label"]),
        }

    def doc_to_text(self, doc):
        return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["passage"]
--- a/lm_eval/tasks/mathqa.py
+++ b/lm_eval/tasks/mathqa.py
@@ -50,11 +50,14 @@ class MathQA(MultipleChoiceTask):
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
-        answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
-        choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
+        answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"])
+        choices = [
+            c[4:].rstrip(" ,")
+            for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"])
+        ]

        out_doc = {
-            "query": "Question: " + doc['Problem'] + "\nAnswer:",
+            "query": "Question: " + doc["Problem"] + "\nAnswer:",
            "choices": choices,
            "gold": answer_idx,
        }
@@ -62,3 +65,9 @@ class MathQA(MultipleChoiceTask):

    def doc_to_text(self, doc):
        return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
--- a/lm_eval/tasks/mc_taco.py
+++ b/lm_eval/tasks/mc_taco.py
@@ -55,14 +55,22 @@ class MCTACO(Task):
        return self.dataset["test"]

    def doc_to_text(self, doc):
-        return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
+        return (
+            f"{doc['sentence']}\nQuestion: {doc['question']}\n"
            f"Answer: {doc['answer']}\nPlausible:"
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["question"] + " " + doc["sentence"]

    def doc_to_target(self, doc):
-        return " " + ["no", "yes"][doc['label']]
+        return " " + ["no", "yes"][doc["label"]]

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -87,18 +95,15 @@ class MCTACO(Task):
            The results of the requests created in construct_requests.
        """
        ll_no, ll_yes = results
-        gold = doc['label']
+        gold = doc["label"]
        pred = int(ll_yes > ll_no)
        question_id = self._question2id(doc)
        items = (gold, pred, question_id)
-        return {
-            "em": items,
-            "f1": items
-        }
+        return {"em": items, "f1": items}

    def _question2id(self, doc):
-        """ Returns an identifier for the question in the given document. """
-        return " ".join([doc['sentence'], doc['question']])
+        """Returns an identifier for the question in the given document."""
+        return " ".join([doc["sentence"], doc["question"]])

    def aggregation(self):
        return {
@@ -126,7 +131,7 @@ def exact_match(items):


 def f1(items):
-    """ See section 4 "Evaluation Metrics" in the paper about the F1 metric used. """
+    """See section 4 "Evaluation Metrics" in the paper about the F1 metric used."""
    results = list(zip(*items))
    # Group the positive ("yes" = 1) golds and predictions by question.
    gold_positives, pred_positives = defaultdict(list), defaultdict(list)
@@ -140,5 +145,5 @@ def f1(items):
        p = tp / pp if pp > 0.0 else 1.0
        r = tp / gp if gp > 0.0 else 1.0
        if p + r > 0.0:
-            f1.append(2. * (p * r) / (p + r))
+            f1.append(2.0 * (p * r) / (p + r))
    return np.mean(f1)
--- a/lm_eval/tasks/mgsm.py
+++ b/lm_eval/tasks/mgsm.py
+"""
+Language Models are Multilingual Chain-of-Thought Reasoners
+https://arxiv.org/abs/2210.03057
+
+Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).
+
+The same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) are each translated via human annotators in 10 languages. The 10 languages are:
+- Spanish
+- French
+- German
+- Russian
+- Chinese
+- Japanese
+- Thai
+- Swahili
+- Bengali
+- Telugu
+
+GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality linguistically diverse grade school math word problems. The dataset was created to support the task of question answering on basic mathematical problems that require multi-step reasoning.
+
+You can find the input and targets for each of the ten languages (and English) as `.tsv` files.
+We also include few-shot exemplars that are also manually translated from each language in `exemplars.py`.
+
+Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm
+"""
+import re
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
+
+
+_CITATION = """
+@misc{cobbe2021training,
+    title={Training Verifiers to Solve Math Word Problems},
+    author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+    year={2021},
+    eprint={2110.14168},
+    archivePrefix={arXiv},
+    primaryClass={cs.LG}
+}
+@misc{shi2022language,
+    title={Language Models are Multilingual Chain-of-Thought Reasoners},
+    author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
+    year={2022},
+    eprint={2210.03057},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
+ANS_RE = re.compile(r"(\-?\d+)")
+INVALID_ANS = "[invalid]"
+
+
+class MGSM(Task):
+    VERSION = 0
+    DATASET_PATH = "juletxara/mgsm"
+    DATASET_NAME = None
+    QUESTION = "Question:"
+    ANSWER = "Step-by-Step Answer:"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        return self.dataset["train"]
+
+    def validation_docs(self):
+        raise NotImplementedError
+
+    def test_docs(self):
+        return self.dataset["test"]
+
+    def doc_to_text(self, doc):
+        if doc["answer"] is not None:
+            return doc["question"] + "\n" + self.ANSWER
+        else:
+            return self.QUESTION + " " + doc["question"] + "\n" + self.ANSWER
+
+    def doc_to_target(self, doc):
+        if doc["answer"] is not None:
+            return " " + doc["answer"][len(self.ANSWER) + 1 :]
+        else:
+            return " " + str(doc["answer_number"])
+
+    def construct_requests(self, doc, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        completion = rf.greedy_until(ctx, {"until": ["\n", ":", self.QUESTION]})
+        return completion
+
+    def _extract_answer(self, completion):
+        match = re.findall(ANS_RE, completion)
+        if match:
+            return int(match[-1])
+        else:
+            return INVALID_ANS
+
+    def _is_correct(self, completion, answer):
+        gold = answer
+        assert gold != INVALID_ANS, "No ground truth answer found in the document."
+        return self._extract_answer(completion) == gold
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        completion = results[0]
+        answer = doc["answer_number"]
+        return {"acc": self._is_correct(completion, answer)}
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        return {"acc": mean}
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {"acc": True}
+
+
+class MGSM_English(MGSM):
+    DATASET_NAME = "en"
+    QUESTION = "Question:"
+    ANSWER = "Step-by-Step Answer:"
+
+
+class MGSM_Spanish(MGSM):
+    DATASET_NAME = "es"
+    QUESTION = "Pregunta:"
+    ANSWER = "Respuesta paso a paso:"
+
+
+class MGSM_French(MGSM):
+    DATASET_NAME = "fr"
+    QUESTION = "Question :"
+    ANSWER = "R\u00e9ponse \u00e9tape par \u00e9tape :"
+
+
+class MGSM_German(MGSM):
+    DATASET_NAME = "de"
+    QUESTION = "Frage:"
+    ANSWER = "Schritt-f\u00fcr-Schritt-Antwort:"
+
+
+class MGSM_Russian(MGSM):
+    DATASET_NAME = "ru"
+    QUESTION = "\u0417\u0430\u0434\u0430\u0447\u0430:"
+    ANSWER = "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:"
+
+
+class MGSM_Chinese(MGSM):
+    DATASET_NAME = "zh"
+    QUESTION = "\u95ee\u9898:"
+    ANSWER = "\u9010\u6b65\u89e3\u7b54:"
+
+
+class MGSM_Japanese(MGSM):
+    DATASET_NAME = "ja"
+    QUESTION = "\u554f\u984c:"
+    ANSWER = "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:"
+
+
+class MGSM_Thai(MGSM):
+    DATASET_NAME = "th"
+    QUESTION = "\u0e42\u0e08\u0e17\u0e22\u0e4c:"
+    ANSWER = "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:"
+
+
+class MGSM_Swahili(MGSM):
+    DATASET_NAME = "sw"
+    QUESTION = "Swali:"
+    ANSWER = "Jibu la Hatua kwa Hatua:"
+
+
+class MGSM_Bengali(MGSM):
+    DATASET_NAME = "bn"
+    QUESTION = "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:"
+    ANSWER = "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:"
+
+
+class MGSM_Telugu(MGSM):
+    DATASET_NAME = "te"
+    QUESTION = "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:"
+    ANSWER = "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:"
+
+
+LANGS = ["en", "es", "fr", "de", "ru", "zh", "ja", "th", "sw", "bn", "te"]
+
+LANG_CLASSES = [
+    MGSM_English,
+    MGSM_Spanish,
+    MGSM_French,
+    MGSM_German,
+    MGSM_Russian,
+    MGSM_Chinese,
+    MGSM_Japanese,
+    MGSM_Thai,
+    MGSM_Swahili,
+    MGSM_Bengali,
+    MGSM_Telugu,
+]
+
+
+def construct_tasks():
+    tasks = {}
+    for lang, lang_class in zip(LANGS, LANG_CLASSES):
+        tasks[f"mgsm_{lang}"] = lang_class
+    return tasks
--- a/lm_eval/tasks/mutual.py
+++ b/lm_eval/tasks/mutual.py
@@ -29,7 +29,7 @@ class MuTualBase(Task):
    VERSION = 1
    DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
    DATASET_NAME = None
-    CHOICES = ['A', 'B', 'C', 'D']
+    CHOICES = ["A", "B", "C", "D"]

    def has_training_docs(self):
        return True
@@ -52,6 +52,12 @@ class MuTualBase(Task):
    def doc_to_text(self, doc):
        return self.detokenize(doc["article"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["article"]
+
    def doc_to_target(self, doc):
        return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])

@@ -82,26 +88,14 @@ class MuTualBase(Task):
        r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
        ranks = sorted(results, reverse=True)
        r4_2 = (ranks.index(results[gold]) == 1) + r4_1
-        mrr = 1. / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
-        return {
-            "r@1": r4_1,
-            "r@2": r4_2,
-            "mrr": mrr
-        }
+        mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
+        return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}

    def aggregation(self):
-        return {
-            "r@1": mean,
-            "r@2": mean,
-            "mrr": mean
-        }
+        return {"r@1": mean, "r@2": mean, "mrr": mean}

    def higher_is_better(self):
-        return {
-            "r@1": True,
-            "r@2": True,
-            "mrr": True
-        }
+        return {"r@1": True, "r@2": True, "mrr": True}


 class MuTual(MuTualBase):