gaoqiong / lm-evaluation-harness · Commits

Commit 025fa6e8 (Unverified)
Authored Jun 15, 2023 by Hailey Schoelkopf, committed by GitHub Jun 15, 2023
Delete gsm8k.py
parent 2db1e7cb
Changes: 1

Showing 1 changed file with 0 additions and 147 deletions

lm_eval/tasks/gsm8k.py  +0 −147
lm_eval/tasks/gsm8k.py  deleted  100644 → 0
"""
"Training Verifiers to Solve Math Word Problems"
https://arxiv.org/abs/2110.14168
State-of-the-art language models can match human performance on many tasks, but
they still struggle to robustly perform multi-step mathematical reasoning. To
diagnose the failures of current models and support research, we introduce GSM8K,
a dataset of 8.5K high quality linguistically diverse grade school math word problems.
We find that even the largest transformer models fail to achieve high test performance,
despite the conceptual simplicity of this problem distribution.
NOTE: See the official implementation of the task:
https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py
for how to make use of the dataset's calculator annotations in your language
model's sample/generation function.
Homepage: https://github.com/openai/grade-school-math
"""
import
re
from
lm_eval
import
utils
from
lm_eval.api.task
import
Task
from
lm_eval.api.metrics
import
mean
from
lm_eval.api.instance
import
Instance
from
lm_eval.prompts
import
get_prompt
from
lm_eval.api.registry
import
register_task
,
register_group
_CITATION
=
"""
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
"""
ANS_RE
=
re
.
compile
(
r
"#### (\-?[0-9\.\,]+)"
)
INVALID_ANS
=
"[invalid]"
@
register_task
(
"gsm8k"
)
class
GradeSchoolMath8K
(
Task
):
VERSION
=
0
DATASET_PATH
=
"gsm8k"
DATASET_NAME
=
"main"
OUTPUT_TYPE
=
"greedy_until"
def
has_training_docs
(
self
):
return
True
def
has_validation_docs
(
self
):
return
False
def
has_test_docs
(
self
):
return
True
def
training_docs
(
self
):
return
self
.
dataset
[
"train"
]
def
validation_docs
(
self
):
raise
NotImplementedError
def
test_docs
(
self
):
return
self
.
dataset
[
"test"
]
def
doc_to_text
(
self
,
doc
):
doc_to_text
=
get_prompt
(
"qa-basic:question-newline-answer"
)
return
utils
.
apply_template
(
doc_to_text
,
doc
)
# return "Question: " + doc["question"] + "\nAnswer:"
def
doc_to_target
(
self
,
doc
):
return
" "
+
doc
[
"answer"
]
def
construct_requests
(
self
,
doc
,
ctx
,
**
kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# NOTE: The paper implements "verifiers" that assign a score to multiple
# solutions and output the highest ranked solution.
return
Instance
(
request_type
=
self
.
OUTPUT_TYPE
,
doc
=
doc
,
arguments
=
(
ctx
,
[
"
\n\n
"
]),
idx
=
0
,
**
kwargs
)
# completion = rf.greedy_until(ctx, ["\n"])
# return completion
def
_extract_answer
(
self
,
completion
):
match
=
ANS_RE
.
search
(
completion
)
if
match
:
match_str
=
match
.
group
(
1
).
strip
()
match_str
=
match_str
.
replace
(
","
,
""
)
return
match_str
else
:
return
INVALID_ANS
def
_is_correct
(
self
,
completion
,
answer
):
gold
=
self
.
_extract_answer
(
answer
)
assert
gold
!=
INVALID_ANS
,
"No ground truth answer found in the document."
# return self._extract_answer(completion) == gold
# print(completion)
return
self
.
_extract_answer
(
completion
)
==
gold
def
process_results
(
self
,
doc
,
results
):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
completion
=
results
[
0
]
answer
=
doc
[
"answer"
]
return
{
"acc"
:
self
.
_is_correct
(
completion
,
answer
)}
def
aggregation
(
self
):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return
{
"acc"
:
mean
}
def
higher_is_better
(
self
):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return
{
"acc"
:
True
}
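For context on the grading logic in the deleted file: GSM8K reference answers place the final numeric result after a "#### " marker, which is what ANS_RE matches and what _extract_answer normalizes before comparison. The following is a minimal standalone sketch of that extraction step, separate from the Task class; the example strings are illustrative, not taken from this commit.

import re

# Same pattern as ANS_RE above: an optionally signed run of digits, commas,
# and periods following the "#### " marker.
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
INVALID_ANS = "[invalid]"


def extract_answer(completion: str) -> str:
    """Pull the final numeric answer out of a GSM8K-style answer/completion string."""
    match = ANS_RE.search(completion)
    if match:
        # Strip whitespace and thousands separators, mirroring _extract_answer.
        return match.group(1).strip().replace(",", "")
    return INVALID_ANS


# Hypothetical strings in the GSM8K answer format, for illustration only.
print(extract_answer("She sold 48 + 24 = 72 clips in total.\n#### 72"))  # -> "72"
print(extract_answer("no final answer marker here"))                     # -> "[invalid]"

Correctness in process_results is then simply string equality between the extracted model answer and the extracted gold answer, aggregated with mean as "acc".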