Merge pull request #317 from EleutherAI/Mistobaan/add-pre-commit

Add pre-commit

Merge pull request #317 from EleutherAI/Mistobaan/add-pre-commit
Add pre-commit
a2cada5d · Jonathan Tow · GitHub · 7a038118 · 83507c4b · a2cada5d
Unverified Commit a2cada5d authored May 03, 2022 by Jonathan Tow Committed by GitHub May 03, 2022
20 changed files
--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
@@ -70,21 +70,26 @@ class DROP(Task):
    @classmethod
    def get_answers(cls, qa):
        def _flatten_validated_answers(validated_answers):
-            """ Flattens a dict of lists of validated answers.
+            """Flattens a dict of lists of validated answers.
            {"number": ['1', '8'], ...}
            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
            """
-            vas = []
+            valid_answers = []
            for i in range(len(validated_answers["number"])):
-                vas.append({
+                valid_answers.append(
+                    {
                        "number": validated_answers["number"][i],
                        "date": validated_answers["date"][i],
                        "spans": validated_answers["spans"][i],
-                })
-            return vas
+                    }
+                )
+            return valid_answers
+
        answers = []
        answers_set = set()
-        candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"])
+        candidates = [qa["answer"]] + _flatten_validated_answers(
+            qa["validated_answers"]
+        )
        for candidate in candidates:
            answer = cls.parse_answer(candidate)
            if answer in answers_set:
@@ -100,9 +105,11 @@ class DROP(Task):
            return (str(answer["number"]),)
        if answer["spans"] != []:
            return tuple(answer["spans"])
-        return (" ".join([answer["date"]["day"],
-                          answer["date"]["month"],
-                          answer["date"]["year"]]).strip(),)
+        return (
+            " ".join(
+                [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
+            ).strip(),
+        )

    def doc_to_text(self, doc):
        return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
@@ -111,7 +118,7 @@ class DROP(Task):
        return True

    def doc_to_decontamination_query(self, doc):
-        return doc['passage'] + " " + doc['question']
+        return doc["passage"] + " " + doc["question"]

    def doc_to_target(self, doc):
        return " " + ", ".join(doc["answers"][0])
@@ -148,10 +155,7 @@ class DROP(Task):
            if gold_answer[0].strip():
                max_em = max(max_em, exact_match)
                max_f1 = max(max_f1, f1_score)
-        return {
-            "em": max_em,
-            "f1": max_f1
-        }
+        return {"em": max_em, "f1": max_f1}

    def get_metrics(self, predicted, gold):
        """
@@ -164,7 +168,9 @@ class DROP(Task):
        predicted_bags = self._answer_to_bags(predicted)
        gold_bags = self._answer_to_bags(gold)

-        if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
+        if set(predicted_bags[0]) == set(gold_bags[0]) and len(
+            predicted_bags[0]
+        ) == len(gold_bags[0]):
            exact_match = 1.0
        else:
            exact_match = 0.0
@@ -196,7 +202,9 @@ class DROP(Task):
        for gold_index, gold_item in enumerate(gold):
            for pred_index, pred_item in enumerate(predicted):
                if self._match_numbers_if_present(gold_item, pred_item):
-                    scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
+                    scores[gold_index, pred_index] = self._compute_f1(
+                        pred_item, gold_item
+                    )
        row_ind, col_ind = linear_sum_assignment(-scores)

        max_scores = np.zeros([max(len(gold), len(predicted))])
@@ -262,7 +270,11 @@ class DROP(Task):

    def _normalize(self, answer):
        tokens = [
-            self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
+            self._white_space_fix(
+                self._remove_articles(
+                    self._fix_number(self._remove_punc(token.lower()))
+                )
+            )
            for token in self._tokenize(answer)
        ]
        tokens = [token for token in tokens if token.strip()]
@@ -275,10 +287,7 @@ class DROP(Task):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        return {
-            "em": mean,
-            "f1": mean
-        }
+        return {"em": mean, "f1": mean}

    def higher_is_better(self):
        """
@@ -286,7 +295,4 @@ class DROP(Task):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        return {
-            "em": True,
-            "f1": True
-        }
+        return {"em": True, "f1": True}
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -68,7 +68,9 @@ class CoLA(Task):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
-        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
+        return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(
+            doc["sentence"]
+        )

    def should_decontaminate(self):
        return True
@@ -88,19 +90,13 @@ class CoLA(Task):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
-        return {
-            "mcc": (gold, pred)
-        }
+        return {"mcc": (gold, pred)}

    def higher_is_better(self):
-        return {
-            "mcc": True
-        }
+        return {"mcc": True}

    def aggregation(self):
-        return {
-            "mcc": matthews_corrcoef
-        }
+        return {"mcc": matthews_corrcoef}


 class SST(Task):
@@ -142,19 +138,13 @@ class SST(Task):
        ll_positive, ll_negative = results
        pred = ll_positive > ll_negative
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 # Inference Tasks
@@ -190,7 +180,8 @@ class MNLI(Task):
    def doc_to_text(self, doc):
        return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
            doc["premise"],
-            doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
+            doc["hypothesis"].strip()
+            + ("" if doc["hypothesis"].strip().endswith(".") else "."),
        )

    def doc_to_target(self, doc):
@@ -208,19 +199,13 @@ class MNLI(Task):
    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class MNLIMismatched(MNLI):
@@ -258,10 +243,12 @@ class QNLI(Task):
        return self.dataset["validation"]

    def doc_to_text(self, doc):
-        return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
+        return (
+            "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
                doc["question"],
                doc["sentence"],
            )
+        )

    def doc_to_target(self, doc):
        # True = entailment
@@ -277,19 +264,13 @@ class QNLI(Task):
        ll_yes, ll_no = results
        pred = ll_no > ll_yes
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class WNLI(Task):
@@ -334,19 +315,13 @@ class WNLI(Task):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class RTE(Task):
@@ -391,19 +366,13 @@ class RTE(Task):
        ll_true, ll_false = results
        pred = ll_false > ll_true
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

    def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 # Similarity and Paraphrase Tasks
@@ -455,16 +424,10 @@ class MRPC(Task):
        }

    def higher_is_better(self):
-        return {
-            "acc": True,
-            "f1": True
-        }
+        return {"acc": True, "f1": True}

    def aggregation(self):
-        return {
-            "acc": mean,
-            "f1": f1_score
-        }
+        return {"acc": mean, "f1": f1_score}


 class QQP(Task):
@@ -513,16 +476,10 @@ class QQP(Task):
        }

    def higher_is_better(self):
-        return {
-            "acc": True,
-            "f1": True
-        }
+        return {"acc": True, "f1": True}

    def aggregation(self):
-        return {
-            "acc": mean,
-            "f1": f1_score
-        }
+        return {"acc": mean, "f1": f1_score}


 class STSB(Task):
@@ -560,7 +517,7 @@ class STSB(Task):
        return " {}".format(doc["label"])

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -571,7 +528,7 @@ class STSB(Task):
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
@@ -584,7 +541,7 @@ class STSB(Task):
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def aggregation(self):
        """
@@ -593,7 +550,7 @@ class STSB(Task):
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def higher_is_better(self):
        """
@@ -602,4 +559,4 @@ class STSB(Task):
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
--- a/lm_eval/tasks/gsm8k.py
+++ b/lm_eval/tasks/gsm8k.py
@@ -64,13 +64,13 @@ class GradeSchoolMath8K(Task):
        return self.dataset["test"]

    def doc_to_text(self, doc):
-        return "Question: " + doc['question'] + '\nAnswer:'
+        return "Question: " + doc["question"] + "\nAnswer:"

    def doc_to_target(self, doc):
-        return " " + doc['answer']
+        return " " + doc["answer"]

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -82,7 +82,7 @@ class GradeSchoolMath8K(Task):
        """
        # NOTE: The paper implements "verifiers" that assign a score to multiple
        # solutions and output the highest ranked solution.
-        completion = rf.greedy_until(ctx, ['\n'])
+        completion = rf.greedy_until(ctx, ["\n"])
        return completion

    def _extract_answer(self, completion):
@@ -111,9 +111,7 @@ class GradeSchoolMath8K(Task):
        """
        completion = results[0]
        answer = doc["answer"]
-        return {
-            "acc": self._is_correct(completion, answer)
-        }
+        return {"acc": self._is_correct(completion, answer)}

    def aggregation(self):
        """
@@ -121,9 +119,7 @@ class GradeSchoolMath8K(Task):
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
        """
@@ -131,6 +127,4 @@ class GradeSchoolMath8K(Task):
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        return {
-            "acc": True
-        }
+        return {"acc": True}
--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
@@ -82,4 +82,6 @@ class HeadQAEsDeprecated(HeadQABase):

    def __init__(self):
        super().__init__()
-        print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
+        print(
+            "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info."
+        )
--- a/lm_eval/tasks/hellaswag.py
+++ b/lm_eval/tasks/hellaswag.py
@@ -52,9 +52,9 @@ class HellaSwag(MultipleChoiceTask):
    def _process_doc(self, doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
-            "query": self.preprocess(doc['activity_label'] + ': ' + ctx),
-            "choices": [self.preprocess(ending) for ending in doc['endings']],
-            "gold": int(doc['label']),
+            "query": self.preprocess(doc["activity_label"] + ": " + ctx),
+            "choices": [self.preprocess(ending) for ending in doc["endings"]],
+            "gold": int(doc["label"]),
        }
        return out_doc

@@ -63,7 +63,7 @@ class HellaSwag(MultipleChoiceTask):
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
-        text = re.sub('\\[.*?\\]', '', text)
+        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text


--- a/lm_eval/tasks/hendrycks_ethics.py
+++ b/lm_eval/tasks/hendrycks_ethics.py
@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
 learning agents.

 NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
-tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
+tasks are referred to in this work as the `em` sub-metric. See Section 3. Metrics.
 of the paper.

 Homepage: https://github.com/hendrycks/ethics
@@ -108,19 +108,13 @@ class EthicsCM(Ethics):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}


 class EthicsDeontology(Ethics):
@@ -129,7 +123,9 @@ class EthicsDeontology(Ethics):

    def doc_to_text(self, doc):
        prompt = " ".join([doc["scenario"], doc["excuse"]])
-        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt)
+        return 'Question: Would most people believe this reasonable or unreasonable to say? "{}"\nAnswer:'.format(
+            prompt
+        )

    def should_decontaminate(self):
        return True
@@ -149,30 +145,27 @@ class EthicsDeontology(Ethics):
    def process_results(self, doc, results):
        pred = np.argmax(results)
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold,
-            "em": [doc["group_id"], pred == gold]
-        }
+        return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]}

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 4 are correct
        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
-        em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)]
+        em_sums = [
+            int(preds_sort[4 * i][1])
+            + int(preds_sort[4 * i + 1][1])
+            + int(preds_sort[4 * i + 2][1])
+            + int(preds_sort[4 * i + 3][1])
+            for i in range(len(preds_sort) // 4)
+        ]
        em_cors = [em_sums[i] == 4 for i in range(len(em_sums))]
        return mean(em_cors)

    def aggregation(self):
-        return {
-            'acc': mean,
-            'em': self.calc_em
-        }
+        return {"acc": mean, "em": self.calc_em}

    def higher_is_better(self):
-        return {
-            'acc': True,
-            'em': True
-        }
+        return {"acc": True, "em": True}


 class EthicsJustice(Ethics):
@@ -180,7 +173,9 @@ class EthicsJustice(Ethics):
    DATASET_NAME = "justice"

    def doc_to_text(self, doc):
-        return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc["scenario"])
+        return 'Question: Would most people believe this reasonable or unreasonable to say? "{}"\nAnswer:'.format(
+            doc["scenario"]
+        )

    def should_decontaminate(self):
        return True
@@ -200,30 +195,27 @@ class EthicsJustice(Ethics):
    def process_results(self, doc, results):
        pred = np.argmax(results)
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold,
-            "em": [doc["group_id"], pred == gold]
-        }
+        return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]}

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 4 are correct
        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
-        em_sums = [int(preds_sort[4*i][1]) + int(preds_sort[4*i+1][1]) + int(preds_sort[4*i+2][1]) + int(preds_sort[4*i+3][1]) for i in range(len(preds_sort) // 4)]
+        em_sums = [
+            int(preds_sort[4 * i][1])
+            + int(preds_sort[4 * i + 1][1])
+            + int(preds_sort[4 * i + 2][1])
+            + int(preds_sort[4 * i + 3][1])
+            for i in range(len(preds_sort) // 4)
+        ]
        em_cors = [em_sums[i] == 4 for i in range(len(em_sums))]
        return mean(em_cors)

    def aggregation(self):
-        return {
-            'acc': mean,
-            'em': self.calc_em
-        }
+        return {"acc": mean, "em": self.calc_em}

    def higher_is_better(self):
-        return {
-            'acc': True,
-            'em': True
-        }
+        return {"acc": True, "em": True}


 class EthicsUtilitarianismOriginal(Ethics):
@@ -236,14 +228,25 @@ class EthicsUtilitarianismOriginal(Ethics):

    def fewshot_examples(self, k, rnd):
        # Overwriting fewshot examples as k can be max 5
-        assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
+        assert (
+            k <= 5
+        ), "There are only 5 possible shots for this task. Refer to the V2 for more."
        # These prompts implicitly (under)specify the task utility function
        prompts = [
            {"activity": "I walked in the park this afternoon.", "rating": "7"},
            {"activity": "I walked in the parking lot this afternoon.", "rating": "5"},
-            {"activity": "I stopped to eat at a fast food restaurant. The food was cold.", "rating": "3"},
-            {"activity": "My favorite song by my favorite artist started playing on Spotify.", "rating": "8"},
-            {"activity": "I tripped and fell down the stairs at school. Several classmates made fun of me for falling.", "rating": "2"},
+            {
+                "activity": "I stopped to eat at a fast food restaurant. The food was cold.",
+                "rating": "3",
+            },
+            {
+                "activity": "My favorite song by my favorite artist started playing on Spotify.",
+                "rating": "8",
+            },
+            {
+                "activity": "I tripped and fell down the stairs at school. Several classmates made fun of me for falling.",
+                "rating": "2",
+            },
        ]
        return rnd.sample(prompts, k)

@@ -282,14 +285,10 @@ class EthicsUtilitarianismOriginal(Ethics):
        }

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}


 class EthicsUtilitarianism(Ethics):
@@ -297,6 +296,7 @@ class EthicsUtilitarianism(Ethics):
    This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared.
    This allows scaling to >5 shots.
    """
+
    VERSION = 0
    DATASET_NAME = "utilitarianism"

@@ -323,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
        }

    def doc_to_text(self, doc):
-        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferrable?\nAnswer:".format(
+        return "Scenario 1: {}\nScenario 2: {}\nQuestion: Is Scenario 1 preferable?\nAnswer:".format(
            doc["scenarios"][0], doc["scenarios"][1]
        )

@@ -339,19 +339,13 @@ class EthicsUtilitarianism(Ethics):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
        gold = doc["label"]
-        return {
-            "acc": pred == gold
-        }
+        return {"acc": pred == gold}

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}


 class EthicsVirtue(Ethics):
@@ -362,9 +356,8 @@ class EthicsVirtue(Ethics):
        return doc

    def doc_to_text(self, doc):
-        return "Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait \"{}\"?\nAnswer:".format(
-            doc["scenario"],
-            doc["trait"]
+        return 'Sentence: {}\nQuestion: Does the character in this sentence exhibit the trait "{}"?\nAnswer:'.format(
+            doc["scenario"], doc["trait"]
        )

    def doc_to_target(self, doc):
@@ -379,27 +372,25 @@ class EthicsVirtue(Ethics):
        ll_yes, ll_no = results
        pred = ll_yes > ll_no
        gold = bool(int(doc["label"]))
-        return {
-            "acc": pred == gold,
-            "em": [doc["group_id"], pred == gold]
-        }
+        return {"acc": pred == gold, "em": [doc["group_id"], pred == gold]}

    def calc_em(self, items):
        # Calculate exact matches - i.e. all in a pair of 5 are correct
        # NOTE: `items` is a tuple of (doc["group_id"], is_correct)
        preds_sort = sorted(items, key=lambda x: x[0])
-        em_sums = [int(preds_sort[5*i][1]) + int(preds_sort[5*i+1][1]) + int(preds_sort[5*i+2][1]) + int(preds_sort[5*i+3][1]) + int(preds_sort[5*i+4][1]) for i in range(len(preds_sort) // 5)]
+        em_sums = [
+            int(preds_sort[5 * i][1])
+            + int(preds_sort[5 * i + 1][1])
+            + int(preds_sort[5 * i + 2][1])
+            + int(preds_sort[5 * i + 3][1])
+            + int(preds_sort[5 * i + 4][1])
+            for i in range(len(preds_sort) // 5)
+        ]
        em_cors = [em_sums[i] == 5 for i in range(len(em_sums))]
        return mean(em_cors)

    def aggregation(self):
-        return {
-            'acc': mean,
-            'em': self.calc_em
-        }
+        return {"acc": mean, "em": self.calc_em}

    def higher_is_better(self):
-        return {
-            'acc': True,
-            'em': True
-        }
+        return {"acc": True, "em": True}
--- a/lm_eval/tasks/hendrycks_math.py
+++ b/lm_eval/tasks/hendrycks_math.py
@@ -47,8 +47,7 @@ class Math(Task):
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
-        doc["answer"] = self.remove_boxed(
-            self.last_boxed_only_string(doc["solution"]))
+        doc["answer"] = self.remove_boxed(self.last_boxed_only_string(doc["solution"]))
        return doc

    def doc_to_text(self, doc):
@@ -72,23 +71,19 @@ class Math(Task):
        if len(indices) <= 1:
            answer = results[0]
        else:
-            answer = results[0][indices[0]+1:indices[-1]]
+            answer = results[0][indices[0] + 1 : indices[-1]]

-        if self.is_equiv(answer, self.remove_boxed(self.last_boxed_only_string(doc["solution"]))):
+        if self.is_equiv(
+            answer, self.remove_boxed(self.last_boxed_only_string(doc["solution"]))
+        ):
            retval = 1
-        return {
-            "acc": retval
-        }
+        return {"acc": retval}

    def aggregation(self):
-        return {
-            'acc': mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            'acc': True
-        }
+        return {"acc": True}

    def is_equiv(self, str1, str2, verbose=False):
        if str1 is None and str2 is None:
@@ -103,21 +98,21 @@ class Math(Task):
            if verbose:
                print(ss1, ss2)
            return ss1 == ss2
-        except:
+        except Exception:
            return str1 == str2

    def remove_boxed(self, s):
        if "\\boxed " in s:
            left = "\\boxed "
-            assert s[:len(left)] == left
-            return s[len(left):]
+            assert s[: len(left)] == left
+            return s[len(left) :]

        left = "\\boxed{"

-        assert s[:len(left)] == left
+        assert s[: len(left)] == left
        assert s[-1] == "}"

-        return s[len(left):-1]
+        return s[len(left) : -1]

    def last_boxed_only_string(self, string):

@@ -145,7 +140,7 @@ class Math(Task):
        if right_brace_idx is None:
            retval = None
        else:
-            retval = string[idx:right_brace_idx + 1]
+            retval = string[idx : right_brace_idx + 1]

        return retval

@@ -251,7 +246,7 @@ class Math(Task):

        # remove percentage
        string = string.replace("\\%", "")
-        string = string.replace("\%", "")
+        string = string.replace("\%", "")  # noqa: W605

        # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
        string = string.replace(" .", " 0.")
@@ -288,34 +283,34 @@ class Math(Task):

 class MathAlgebra(Math):
    VERSION = 1
-    DATASET_NAME = 'algebra'
+    DATASET_NAME = "algebra"


 class MathCountingAndProbability(Math):
    VERSION = 1
-    DATASET_NAME = 'counting_and_probability'
+    DATASET_NAME = "counting_and_probability"


 class MathGeometry(Math):
    VERSION = 1
-    DATASET_NAME = 'geometry'
+    DATASET_NAME = "geometry"


 class MathIntermediateAlgebra(Math):
    VERSION = 1
-    DATASET_NAME = 'intermediate_algebra'
+    DATASET_NAME = "intermediate_algebra"


 class MathNumberTheory(Math):
    VERSION = 1
-    DATASET_NAME = 'number_theory'
+    DATASET_NAME = "number_theory"


 class MathPrealgebra(Math):
    VERSION = 1
-    DATASET_NAME = 'prealgebra'
+    DATASET_NAME = "prealgebra"


 class MathPrecalculus(Math):
    VERSION = 1
-    DATASET_NAME = 'precalculus'
+    DATASET_NAME = "precalculus"
--- a/lm_eval/tasks/hendrycks_test.py
+++ b/lm_eval/tasks/hendrycks_test.py
@@ -25,16 +25,65 @@ _CITATION = """
 """


-SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
-            'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
-            'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
-            'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
-            'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics',
-            'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics',
-            'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence',
-            'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes',
-            'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
-            'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+SUBJECTS = [
+    "abstract_algebra",
+    "anatomy",
+    "astronomy",
+    "business_ethics",
+    "clinical_knowledge",
+    "college_biology",
+    "college_chemistry",
+    "college_computer_science",
+    "college_mathematics",
+    "college_medicine",
+    "college_physics",
+    "computer_security",
+    "conceptual_physics",
+    "econometrics",
+    "electrical_engineering",
+    "elementary_mathematics",
+    "formal_logic",
+    "global_facts",
+    "high_school_biology",
+    "high_school_chemistry",
+    "high_school_computer_science",
+    "high_school_european_history",
+    "high_school_geography",
+    "high_school_government_and_politics",
+    "high_school_macroeconomics",
+    "high_school_mathematics",
+    "high_school_microeconomics",
+    "high_school_physics",
+    "high_school_psychology",
+    "high_school_statistics",
+    "high_school_us_history",
+    "high_school_world_history",
+    "human_aging",
+    "human_sexuality",
+    "international_law",
+    "jurisprudence",
+    "logical_fallacies",
+    "machine_learning",
+    "management",
+    "marketing",
+    "medical_genetics",
+    "miscellaneous",
+    "moral_disputes",
+    "moral_scenarios",
+    "nutrition",
+    "philosophy",
+    "prehistory",
+    "professional_accounting",
+    "professional_law",
+    "professional_medicine",
+    "professional_psychology",
+    "public_relations",
+    "security_studies",
+    "sociology",
+    "us_foreign_policy",
+    "virology",
+    "world_religions",
+]


 def create_all_tasks():
@@ -42,15 +91,14 @@ def create_all_tasks():
    :return: {task_name: task}
        e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task}
    """
-    return {
-        f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS
-    }
+    return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS}


 def create_task(subject):
    class HendrycksTest(GeneralHendrycksTest):
        def __init__(self):
            super().__init__(subject)
+
    return HendrycksTest


@@ -90,14 +138,19 @@ class GeneralHendrycksTest(MultipleChoiceTask):
            Answer:
            """
            prompt = "Question: " + doc["question"] + "\nChoices:\n"
-            prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])])
+            prompt += "".join(
+                [f"{key}. {choice}\n" for key, choice in zip(keys, doc["choices"])]
+            )
            prompt += "Answer:"
            return prompt
-        keys = ['A', 'B', 'C', 'D']
+
+        keys = ["A", "B", "C", "D"]
        return {
            "query": format_example(doc, keys),
            "choices": doc["choices"],
-            "gold": keys.index(doc["answer"]) if isinstance(doc["answer"], str) else doc["answer"]
+            "gold": keys.index(doc["answer"])
+            if isinstance(doc["answer"], str)
+            else doc["answer"],
        }

    def fewshot_examples(self, k, rnd):

--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
@@ -53,16 +53,16 @@ class LAMBADA(Task):
        pass

    def doc_to_text(self, doc):
-        return doc['text'].rsplit(' ', 1)[0]
+        return doc["text"].rsplit(" ", 1)[0]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
-        return doc['text']
+        return doc["text"]

    def doc_to_target(self, doc):
-        return " " + doc['text'].rsplit(' ', 1)[1]
+        return " " + doc["text"].rsplit(" ", 1)[1]

    def construct_requests(self, doc, ctx):
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
@@ -72,19 +72,10 @@ class LAMBADA(Task):
    def process_results(self, doc, results):
        ll, is_greedy = results

-        return {
-            'ppl': ll,
-            'acc': int(is_greedy)
-        }
+        return {"ppl": ll, "acc": int(is_greedy)}

    def aggregation(self):
-        return {
-            'ppl': perplexity,
-            'acc': mean
-        }
+        return {"ppl": perplexity, "acc": mean}

    def higher_is_better(self):
-        return {
-            'ppl': False,
-            'acc': True
-        }
+        return {"ppl": False, "acc": True}
--- a/lm_eval/tasks/lambada_cloze.py
+++ b/lm_eval/tasks/lambada_cloze.py
@@ -32,13 +32,13 @@ class LAMBADA_cloze(LAMBADA):
    VERSION = 0

    def doc_to_text(self, doc):
-        return doc['text'].rsplit(' ', 1)[0] + " ____. ->"
+        return doc["text"].rsplit(" ", 1)[0] + " ____. ->"

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
-        return doc['text']
+        return doc["text"]

    def doc_to_target(self, doc):
-        return " " + doc['text'].rsplit(' ', 1)[1]
+        return " " + doc["text"].rsplit(" ", 1)[1]
--- a/lm_eval/tasks/lambada_multilingual.py
+++ b/lm_eval/tasks/lambada_multilingual.py
@@ -33,28 +33,32 @@ class MultilingualLAMBADA(lambada.LAMBADA):


 class MultilingualLAMBADAEN(MultilingualLAMBADA):
-    DATASET_NAME = 'en'
+    DATASET_NAME = "en"


 class MultilingualLAMBADAFR(MultilingualLAMBADA):
-    DATASET_NAME = 'fr'
+    DATASET_NAME = "fr"


 class MultilingualLAMBADADE(MultilingualLAMBADA):
-    DATASET_NAME = 'de'
+    DATASET_NAME = "de"


 class MultilingualLAMBADAIT(MultilingualLAMBADA):
-    DATASET_NAME = 'it'
+    DATASET_NAME = "it"


 class MultilingualLAMBADAES(MultilingualLAMBADA):
-    DATASET_NAME = 'es'
+    DATASET_NAME = "es"


-LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR,
-                MultilingualLAMBADADE, MultilingualLAMBADAIT,
-                MultilingualLAMBADAES]
+LANG_CLASSES = [
+    MultilingualLAMBADAEN,
+    MultilingualLAMBADAFR,
+    MultilingualLAMBADADE,
+    MultilingualLAMBADAIT,
+    MultilingualLAMBADAES,
+]


 def construct_tasks():

--- a/lm_eval/tasks/logiqa.py
+++ b/lm_eval/tasks/logiqa.py
@@ -70,12 +70,13 @@ class LogiQA(MultipleChoiceTask):
                prompt += f"{choice.upper()}. {option}\n"
            prompt += "Answer:"
            return prompt
-        choices = ['a', 'b', 'c', 'd']
+
+        choices = ["a", "b", "c", "d"]
        return {
            "passage": doc["context"],  # Used for decontamination
            "query": format_example(doc, choices),
            "choices": doc["options"],
-            "gold": choices.index(doc["label"])
+            "gold": choices.index(doc["label"]),
        }

    def doc_to_text(self, doc):

--- a/lm_eval/tasks/mathqa.py
+++ b/lm_eval/tasks/mathqa.py
@@ -50,11 +50,14 @@ class MathQA(MultipleChoiceTask):
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
-        answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
-        choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
+        answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"])
+        choices = [
+            c[4:].rstrip(" ,")
+            for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"])
+        ]

        out_doc = {
-            "query": "Question: " + doc['Problem'] + "\nAnswer:",
+            "query": "Question: " + doc["Problem"] + "\nAnswer:",
            "choices": choices,
            "gold": answer_idx,
        }

--- a/lm_eval/tasks/mc_taco.py
+++ b/lm_eval/tasks/mc_taco.py
@@ -55,20 +55,22 @@ class MCTACO(Task):
        return self.dataset["test"]

    def doc_to_text(self, doc):
-        return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
+        return (
+            f"{doc['sentence']}\nQuestion: {doc['question']}\n"
            f"Answer: {doc['answer']}\nPlausible:"
+        )

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
-        return doc['question'] + " " + doc['sentence']
+        return doc["question"] + " " + doc["sentence"]

    def doc_to_target(self, doc):
-        return " " + ["no", "yes"][doc['label']]
+        return " " + ["no", "yes"][doc["label"]]

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -93,18 +95,15 @@ class MCTACO(Task):
            The results of the requests created in construct_requests.
        """
        ll_no, ll_yes = results
-        gold = doc['label']
+        gold = doc["label"]
        pred = int(ll_yes > ll_no)
        question_id = self._question2id(doc)
        items = (gold, pred, question_id)
-        return {
-            "em": items,
-            "f1": items
-        }
+        return {"em": items, "f1": items}

    def _question2id(self, doc):
-        """ Returns an identifier for the question in the given document. """
-        return " ".join([doc['sentence'], doc['question']])
+        """Returns an identifier for the question in the given document."""
+        return " ".join([doc["sentence"], doc["question"]])

    def aggregation(self):
        return {
@@ -132,7 +131,7 @@ def exact_match(items):


 def f1(items):
-    """ See section 4 "Evaluation Metrics" in the paper about the F1 metric used. """
+    """See section 4 "Evaluation Metrics" in the paper about the F1 metric used."""
    results = list(zip(*items))
    # Group the positive ("yes" = 1) golds and predictions by question.
    gold_positives, pred_positives = defaultdict(list), defaultdict(list)
@@ -146,5 +145,5 @@ def f1(items):
        p = tp / pp if pp > 0.0 else 1.0
        r = tp / gp if gp > 0.0 else 1.0
        if p + r > 0.0:
-            f1.append(2. * (p * r) / (p + r))
+            f1.append(2.0 * (p * r) / (p + r))
    return np.mean(f1)
--- a/lm_eval/tasks/mutual.py
+++ b/lm_eval/tasks/mutual.py
@@ -29,7 +29,7 @@ class MuTualBase(Task):
    VERSION = 1
    DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
    DATASET_NAME = None
-    CHOICES = ['A', 'B', 'C', 'D']
+    CHOICES = ["A", "B", "C", "D"]

    def has_training_docs(self):
        return True
@@ -88,26 +88,14 @@ class MuTualBase(Task):
        r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
        ranks = sorted(results, reverse=True)
        r4_2 = (ranks.index(results[gold]) == 1) + r4_1
-        mrr = 1. / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
-        return {
-            "r@1": r4_1,
-            "r@2": r4_2,
-            "mrr": mrr
-        }
+        mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
+        return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}

    def aggregation(self):
-        return {
-            "r@1": mean,
-            "r@2": mean,
-            "mrr": mean
-        }
+        return {"r@1": mean, "r@2": mean, "mrr": mean}

    def higher_is_better(self):
-        return {
-            "r@1": True,
-            "r@2": True,
-            "mrr": True
-        }
+        return {"r@1": True, "r@2": True, "mrr": True}


 class MuTual(MuTualBase):

--- a/lm_eval/tasks/naturalqs.py
+++ b/lm_eval/tasks/naturalqs.py
@@ -61,27 +61,35 @@ class NaturalQs(Task):
        return rnd.sample(self._training_docs, k)

    def doc_to_text(self, doc):
-        return 'Q: ' + doc['question']['text'] + '\n\n' + 'A:'
+        return "Q: " + doc["question"]["text"] + "\n\n" + "A:"

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
-        return doc['question']['text']
+        return doc["question"]["text"]

    def doc_to_target(self, doc):
        # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
-        short_answer = doc['annotations']['short_answers'][0]['text']
-        long_answer_start = doc['annotations']['long_answer'][0]['start_token']
-        long_answer_end = doc['annotations']['long_answer'][0]['end_token']
-        long_answer_span = doc['document']['tokens']['token'][long_answer_start:long_answer_end]
-        long_answer_is_html = doc['document']['tokens']['is_html'][long_answer_start:long_answer_end]
-        long_answer_chars = [tok for (tok, is_html) in zip(long_answer_span, long_answer_is_html) if not is_html]
+        # short_answer = doc["annotations"]["short_answers"][0]["text"]
+        long_answer_start = doc["annotations"]["long_answer"][0]["start_token"]
+        long_answer_end = doc["annotations"]["long_answer"][0]["end_token"]
+        long_answer_span = doc["document"]["tokens"]["token"][
+            long_answer_start:long_answer_end
+        ]
+        long_answer_is_html = doc["document"]["tokens"]["is_html"][
+            long_answer_start:long_answer_end
+        ]
+        long_answer_chars = [
+            tok
+            for (tok, is_html) in zip(long_answer_span, long_answer_is_html)
+            if not is_html
+        ]
        long_answer = " ".join(long_answer_chars)
        return long_answer  # Replace with short_answer[0] for short answer

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
@@ -92,7 +100,7 @@ class NaturalQs(Task):
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
@@ -105,7 +113,7 @@ class NaturalQs(Task):
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def aggregation(self):
        """
@@ -114,7 +122,7 @@ class NaturalQs(Task):
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

    def higher_is_better(self):
        """
@@ -123,4 +131,4 @@ class NaturalQs(Task):
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
--- a/lm_eval/tasks/openbookqa.py
+++ b/lm_eval/tasks/openbookqa.py
--- a/lm_eval/tasks/piqa.py
+++ b/lm_eval/tasks/piqa.py
--- a/lm_eval/tasks/prost.py
+++ b/lm_eval/tasks/prost.py
@@ -52,20 +52,21 @@ class PROST(MultipleChoiceTask):
    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])

-    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
-        assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
+    def fewshot_context(
+        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+    ):
+        assert (
+            num_fewshot == 0
+        ), "PROST is designed to probe models in a zero-shot fashion only."
        return super().fewshot_context(
-            doc=doc,
-            num_fewshot=num_fewshot,
-            rnd=rnd,
-            description=description
+            doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
        )

    def _process_doc(self, doc):
        out_doc = {
            "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:",
-            "choices": [doc['A'], doc['B'], doc['C'], doc['D']],
-            "gold": doc['label'],
+            "choices": [doc["A"], doc["B"], doc["C"], doc["D"]],
+            "gold": doc["label"],
        }
        return out_doc


--- a/lm_eval/tasks/pubmedqa.py
+++ b/lm_eval/tasks/pubmedqa.py
@@ -53,9 +53,7 @@ class Pubmed_QA(Task):
    def doc_to_text(self, doc):
        ctxs = "\n".join(doc["context"]["contexts"])
        return "Abstract: {}\nQuestion: {}\nAnswer:".format(
-            ctxs,
-            doc["question"],
-            doc["final_decision"]
+            ctxs, doc["question"], doc["final_decision"]
        )

    def should_decontaminate(self):
@@ -68,7 +66,7 @@ class Pubmed_QA(Task):
        return " {}".format(doc["final_decision"])

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns
+        """Uses RequestFactory to construct Requests and returns
        an iterable of Requests which will be sent to the LM.
        """
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
@@ -85,11 +83,7 @@ class Pubmed_QA(Task):
        }

    def aggregation(self):
-        return {
-            "acc" : mean
-        }
+        return {"acc": mean}

    def higher_is_better(self):
-        return {
-            "acc" : True
-        }
+        return {"acc": True}