# MATH
ℹ️ This is the 0-shot variant, reproducing https://huggingface.co/datasets/meta-llama/Llama-3.1-8B-Instruct-evals/viewer/Llama-3.1-8B-Instruct-evals__math__details?row=0
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompts and the extraction of the generated answer are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is computed using the `sympy` library (a sketch of this check is shown below). This requires additional dependencies, which can be installed via the `lm-eval[math]` extra.
Homepage: https://github.com/hendrycks/math
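For illustration, the `sympy`-based equivalence check is conceptually similar to the sketch below; the actual implementation in the task's `utils` module adds a timeout and error handling. The helper name `latex_equiv` is ours, not part of the harness.

```python
from sympy import simplify
from sympy.parsing.latex import parse_latex  # needs the antlr4 runtime pulled in by lm-eval[math]


def latex_equiv(a: str, b: str) -> bool:
    # Two normalized answers count as an exact match if their
    # symbolic difference simplifies to zero.
    return simplify(parse_latex(a) - parse_latex(b)) == 0


latex_equiv(r"\frac{1}{2}", "0.5")  # True
```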
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@misc{2206.14858,
Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
Title = {Solving Quantitative Reasoning Problems with Language Models},
Year = {2022},
Eprint = {arXiv:2206.14858},
}
```
### Groups and Tasks
[//]: # (#### Groups)
[//]: # ()
[//]: # (- `llama_math`)
#### Tasks
- `llama_math_algebra`
- `llama_math_counting_and_prob`
- `llama_math_geometry`
- `llama_math_intermediate_algebra`
- `llama_math_num_theory`
- `llama_math_prealgebra`
- `llama_math_precalc`
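These tasks can be run like any other harness task; a sketch using the Python API follows (the model name and arguments are illustrative, not a recommendation):

```python
# Sketch: evaluating one subtask via the lm-evaluation-harness Python API.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Llama-3.1-8B-Instruct",
    tasks=["llama_math_algebra"],
    num_fewshot=0,
)
print(results["results"])
```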
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* The implementation in the original paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3, however the few-shot context used here is sourced from [Lewkowycz et al](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is comparable to that provided in the paper, though not identical.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
### Variant Wishlist
- [ ] zero-shot variant
task: math500
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
output_type: generate_until
training_split: train
test_split: test
doc_to_text: "Solve the following math problem efficiently and clearly:\n\n- For simple problems (2 steps or fewer):\nProvide a concise solution with minimal explanation.\n\n- For complex problems (3 steps or more):\nUse this step-by-step format:\n\n## Step 1: [Concise description]\n[Brief explanation and calculations]\n\n## Step 2: [Concise description]\n[Brief explanation and calculations]\n\n...\n\nRegardless of the approach, always conclude with:\n\nTherefore, the final answer is: $\\\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem.\n\nProblem: {{ problem }}"
process_results: !function utils.process_results
doc_to_target: "{{answer if few_shot is undefined else solution}}"
generation_kwargs:
  # until:
  #   - "Problem:"
  max_gen_toks: 5120
  do_sample: false
  temperature: 0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
num_fewshot: 0
metadata:
  version: 1.0
dataset_kwargs:
  trust_remote_code: true
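The `doc_to_target` field is a Jinja template: in the zero-shot case the doc carries no `few_shot` flag, so the short `answer` string is the target; when few-shot examples are injected, the full `solution` is used instead. A minimal illustration of how that template resolves, using `jinja2` directly outside the harness:

```python
from jinja2 import Template

t = Template("{{answer if few_shot is undefined else solution}}")
print(t.render(answer="2", solution="$1+1=\\boxed{2}$"))                 # -> 2
print(t.render(answer="2", solution="$1+1=\\boxed{2}$", few_shot=True))  # -> $1+1=\boxed{2}$
```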
import re
import signal
from typing import Dict, List, Optional
import datasets
from lm_eval.utils import eval_logger
try:
    import sympy
    from sympy.parsing.latex import parse_latex
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        "`sympy` is required for parsing and comparing LaTeX answers in this task. "
        "Please install it via `pip install lm-eval[math]` or `pip install -e .[math]`."
    )
template = "Solve the following math problem efficiently and clearly:\n\n- For simple problems (2 steps or fewer):\nProvide a concise solution with minimal explanation.\n\n- For complex problems (3 steps or more):\nUse this step-by-step format:\n\n## Step 1: [Concise description]\n[Brief explanation and calculations]\n\n## Step 2: [Concise description]\n[Brief explanation and calculations]\n\n...\n\nRegardless of the approach, always conclude with:\n\nTherefore, the final answer is: $\\\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem.\n\nProblem: {{ problem }}"
# taken from
# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
# def doc_to_text(doc: dict) -> str:
# return "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:"
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc: dict) -> dict:
        out_doc = {
            "problem": doc["problem"],
            "solution": doc["solution"],
            "answer": normalize_final_answer(doc["answer"]),
        }
        # `doc` is a dict, so check the key with .get(); getattr() would always return None here.
        if doc.get("few_shot") is not None:
            out_doc["few_shot"] = True
        return out_doc

    return dataset.map(_process_doc)
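

# Hypothetical usage sketch (field values are illustrative, not from the dataset):
#   ds = datasets.Dataset.from_list(
#       [{"problem": "What is $1+1$?", "solution": "$1+1=\\boxed{2}$", "answer": "2"}]
#   )
#   process_docs(ds)[0]["answer"]  # -> "2"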
# def list_fewshot_samples() -> list[dict]:
#     return [
#         {
#             "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
#             "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.",
#             "few_shot": "1",
#         },
#         {
#             "problem": "If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$",
#             "solution": "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.",
#             "few_shot": "1",
#         },
#         {
#             "problem": "Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?",
#             "solution": "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:\n\\begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.",
#             "few_shot": "1",
#         },
#         {
#             "problem": "If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,\nfind $\\frac{a}{b},$ assuming $b$ is nonzero.",
#             "solution": "If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.",
#             "few_shot": "1",
#         },
#     ]
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    candidates = results[0]
    answer = normalize_final_answer(remove_boxed(last_boxed_only_string(candidates)))
    retval = 1 if is_equiv(answer, doc["answer"]) else 0
    return {"exact_match": retval}
def last_boxed_only_string(string: str) -> Optional[str]:
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx : right_brace_idx + 1]

    return retval
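

# Example (illustrative):
#   last_boxed_only_string("The answer is $\\boxed{\\frac{2}{3}}$.")  # -> "\\boxed{\\frac{2}{3}}"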
def remove_boxed(s: str) -> str:
    if not s:
        return ""
    if "\\boxed " in s:
        left = "\\boxed "
        assert s[: len(left)] == left
        return s[len(left) :]

    left = "\\boxed{"
    assert s[: len(left)] == left
    assert s[-1] == "}"
    return s[len(left) : -1]
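

# Example (illustrative):
#   remove_boxed("\\boxed{\\frac{2}{3}}")  # -> "\\frac{2}{3}"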
class timeout:
    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        signal.alarm(0)
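

# Note: `timeout` relies on SIGALRM, so it works only on Unix-like systems and
# only in the main thread, with whole-second granularity (signal.alarm()).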
def is_equiv(x1: str, x2: str) -> bool:
    """
    Check whether x1 and x2 (normalized LaTeX strings) are symbolically equivalent.
    """
    try:
        with timeout(seconds=5):
            try:
                parsed_x1 = parse_latex(x1)
                parsed_x2 = parse_latex(x2)
            except (
                sympy.parsing.latex.errors.LaTeXParsingError,
                sympy.SympifyError,
                TypeError,
            ):
                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
                return False

            try:
                diff = parsed_x1 - parsed_x2
            except TypeError:
                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
                return False

            try:
                if sympy.simplify(diff) == 0:
                    return True
                else:
                    return False
            except ValueError:
                eval_logger.debug(
                    f"Had some trouble simplifying when comparing {x1} and {x2}"
                )
    except TimeoutError:
        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
        return False
    except ImportError as e:
        eval_logger.error(e)
        raise
    except Exception as e:
        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
        return False
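

# Example (illustrative; parse_latex needs the antlr4 runtime from lm-eval[math]):
#   is_equiv("\\frac{1}{2}", "0.5")  # -> True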
def get_unnormalized_answer(text: str) -> str:
    INVALID_ANSWER = "[invalidanswer]"
    end_seq = "I hope it is correct."
    # Append the end sequence so that generations truncated before it can still match.
    text += end_seq
    match = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        text,
    )
    if match:
        return match.group(1).strip()
    else:
        return INVALID_ANSWER
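

# Example (illustrative):
#   get_unnormalized_answer("Final Answer: The final answer is $42$. I hope it is correct.")  # -> "$42$"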
SUBSTITUTIONS = [
    ("an ", ""),
    ("a ", ""),
    (".$", "$"),
    ("\\$", ""),
    (r"\ ", ""),
    (" ", ""),
    ("mbox", "text"),
    (",\\text{and}", ","),
    ("\\text{and}", ","),
    ("\\text{m}", "\\text{}"),
]
REMOVED_EXPRESSIONS = [
    "square",
    "ways",
    "integers",
    "dollars",
    "mph",
    "inches",
    "ft",
    "hours",
    "km",
    "units",
    "\\ldots",
    "sue",
    "points",
    "feet",
    "minutes",
    "digits",
    "cents",
    "degrees",
    "cm",
    "gm",
    "pounds",
    "meters",
    "meals",
    "edges",
    "students",
    "childrentickets",
    "multiples",
    "\\text{s}",
    "\\text{.}",
    "\\text{\ns}",
    "\\text{}^2",
    "\\text{}^3",
    "\\text{\n}",
    "\\text{}",
    r"\mathrm{th}",
    r"^\circ",
    r"^{\circ}",
    r"\;",
    r",\!",
    "{,}",
    '"',
    "\\dots",
]
def normalize_final_answer(final_answer: Optional[str]) -> str:
    """
    Normalize a final answer to a quantitative reasoning question.

    Copied character for character from appendix D of Lewkowycz et al. (2022)
    """
    if not final_answer:
        return ""
    final_answer = final_answer.split("=")[-1]

    for before, after in SUBSTITUTIONS:
        final_answer = final_answer.replace(before, after)
    for expr in REMOVED_EXPRESSIONS:
        final_answer = final_answer.replace(expr, "")

    # Extract answer that is in LaTeX math, is bold,
    # is surrounded by a box, etc.
    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)

    # Normalize shorthand TeX:
    #  \fracab -> \frac{a}{b}
    #  \frac{abc}{bef} -> \frac{abc}{bef}
    #  \fracabc -> \frac{a}{b}c
    #  \sqrta -> \sqrt{a}
    #  \sqrtab -> \sqrt{a}b
    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
    final_answer = final_answer.replace("$", "")

    # Normalize 100,000 -> 100000
    if final_answer.replace(",", "").isdigit():
        final_answer = final_answer.replace(",", "")

    return final_answer
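

# Examples (illustrative):
#   normalize_final_answer("$x=\\frac{1}{2}$")  # -> "\\frac{1}{2}"
#   normalize_final_answer("100,000")           # -> "100000"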