Commit 1a34d084 authored by baberabb's avatar baberabb
Browse files

add hendryck_math tasks; bugfixes

parent 57b20eef
...@@ -40,7 +40,9 @@ repos: ...@@ -40,7 +40,9 @@ repos:
- id: codespell - id: codespell
exclude: > exclude: >
(?x)^( (?x)^(
.*\.json|ignore.txt .*\.json
|ignore.txt
|lm_eval/tasks/.*
)$ )$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt] args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy
......
...@@ -46,14 +46,14 @@ class Sampler: ...@@ -46,14 +46,14 @@ class Sampler:
) )
+ self.target_delimiter + self.target_delimiter
+ ( + (
self.doc_to_target(doc)[0] str(self.doc_to_target(doc)[0])
if type(self.doc_to_target(doc)) is list if type(self.doc_to_target(doc)) is list
else self.doc_to_target(doc) else self.doc_to_target(doc)
if ( if (
self.config.doc_to_choice is None self.config.doc_to_choice is None
or type(self.doc_to_target(doc)) is str or type(self.doc_to_target(doc)) is str
) )
else self.doc_to_choice(doc)[self.doc_to_target(doc)] else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
) )
for doc in selected_docs for doc in selected_docs
] ]
......
...@@ -581,7 +581,7 @@ class ConfigurableTask(Task): ...@@ -581,7 +581,7 @@ class ConfigurableTask(Task):
INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
metric_agg = get_default_aggregation(metric_name) metric_agg = get_default_aggregation(metric_name)
eval_logger.warning( eval_logger.warning(
f"metric {metric_name} is defined, but aggregation is not. " f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. "
f"using default " f"using default "
f"aggregation={INV_AGG_REGISTRY[metric_agg]}" f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
) )
...@@ -593,7 +593,7 @@ class ConfigurableTask(Task): ...@@ -593,7 +593,7 @@ class ConfigurableTask(Task):
] ]
else: else:
eval_logger.warning( eval_logger.warning(
f"metric {metric_name} is defined, but higher_is_better is not. " f"[Task: {self._config.task}] metric {metric_name} is defined, but higher_is_better is not. "
f"using default " f"using default "
f"higher_is_better={is_higher_better(metric_name)}" f"higher_is_better={is_higher_better(metric_name)}"
) )
...@@ -838,7 +838,10 @@ class ConfigurableTask(Task): ...@@ -838,7 +838,10 @@ class ConfigurableTask(Task):
and (target_string[0] == "[") and (target_string[0] == "[")
and (target_string[-1] == "]") and (target_string[-1] == "]")
): ):
return ast.literal_eval(target_string) try:
return ast.literal_eval(target_string)
except (SyntaxError, ValueError):
return target_string
else: else:
return target_string return target_string
elif type(doc_to_target) == list: elif type(doc_to_target) == list:
......
# Benchmark group `hendrycks_math`: lists the seven MATH subject tasks so they
# can be referred to under a single name.
group: hendrycks_math
task:
- math_algebra
- math_counting_and_prob
- math_geometry
- math_intermediate_algebra
- math_num_theory
- math_prealgebra
- math_precalc
# MATH
ℹ️ This is the 4-shot variant!
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: The few-shot prompt and the generated-answer extraction are based on the [Minerva](https://arxiv.org/abs/2206.14858) paper, and exact-match equivalence is calculated using the `sympy` library.
Homepage: https://github.com/hendrycks/math
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `hendrycks_math`
#### Groups
- `math_word_problems`
- `greedy_until`
#### Tasks
- `math_algebra`
- `math_counting_and_prob`
- `math_geometry`
- `math_intermediate_algebra`
- `math_num_theory`
- `math_prealgebra`
- `math_precalc`
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* The implementation in the original paper is one where the model is first fine-tuned on the data. The paper does include a few-shot evaluation for GPT-3; however, the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is comparable to that provided in the paper, though not identical.
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Variant Wishlist
- [ ] zero-shot variant
# Base task config for the `algebra` subset of the MATH dataset.
# The prompt, document preprocessing and scoring are delegated to the Python
# helpers in utils.py through the `!function` tags.
group:
  - math_word_problems
task: math_algebra
dataset_path: baber/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: greedy_until
training_split: train
test_split: test
doc_to_text: !function utils.doc_to_text
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
  # Stop as soon as the model starts writing another "Problem:" block.
  until:
    - "Problem:"
  # Greedy decoding.
  do_sample: false
  temperature: 0
metric_list:
  # utils.process_results reports a 0/1 value under the key `exact_match`.
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
# Counting & probability: includes math_algebra.yaml and sets its own dataset_name and task.
include: math_algebra.yaml
dataset_name: counting_and_probability
task: math_counting_and_prob
# Geometry: includes math_algebra.yaml and sets its own dataset_name and task.
include: math_algebra.yaml
dataset_name: geometry
task: math_geometry
# Intermediate algebra: includes math_algebra.yaml and sets its own dataset_name and task.
include: math_algebra.yaml
dataset_name: intermediate_algebra
task: math_intermediate_algebra
# Number theory: includes math_algebra.yaml and sets its own dataset_name and task.
include: math_algebra.yaml
dataset_name: number_theory
task: math_num_theory
# Prealgebra: includes math_algebra.yaml and sets its own dataset_name and task.
include: math_algebra.yaml
dataset_name: prealgebra
task: math_prealgebra
# Precalculus: includes math_algebra.yaml and sets its own dataset_name and task.
include: math_algebra.yaml
dataset_name: precalculus
task: math_precalc
import datasets
import re
import sympy
from sympy.parsing.latex import parse_latex
import signal
from lm_eval.logger import eval_logger
from typing import Optional
# taken from
# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py
def doc_to_text(doc: dict) -> str:
    """
    Build the full 4-shot prompt for a single problem.

    The four worked examples are followed by the problem statement read from
    ``doc["problem"]`` and an open "Solution:" header for the model to
    complete.
    """
    # NOTE: the few-shot text is reproduced verbatim, including the unmatched
    # "}" that ends the first problem statement. Editing it would change what
    # the model is shown.
    few_shot_examples = r"""Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.
Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$
Solution:
We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.
Problem:
If the system of equations
\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.
Solution:
If we multiply the first equation by $-\frac{3}{2}$, we obtain
$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have
$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct."""
    question = doc["problem"]
    # str.join (like "+") rejects a non-string problem with a TypeError.
    pieces = (few_shot_examples, "\n\n", "Problem:", "\n", question, "\n\n", "Solution:")
    return "".join(pieces)
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """
    Map every document to a {"problem", "solution", "answer"} record.

    "answer" is the last boxed expression of the reference solution with the
    box wrapper stripped, passed through normalize_final_answer.
    NOTE(review): last_boxed_only_string can return None; this code assumes
    every solution contains a boxed answer.
    """

    def _process_doc(doc: dict) -> dict:
        record = {
            "problem": doc["problem"],
            "solution": doc["solution"],
        }
        boxed_answer = last_boxed_only_string(record["solution"])
        record["answer"] = normalize_final_answer(remove_boxed(boxed_answer))
        return record

    return dataset.map(_process_doc)
def process_results(doc: dict, results: list[str]) -> dict[str, int]:
    """
    Score one generation against the reference answer.

    Only the first entry of ``results`` is used. Its final answer is
    extracted, normalized and compared with ``doc["answer"]`` via is_equiv.
    Returns {"exact_match": 1} on equivalence and {"exact_match": 0}
    otherwise.
    """
    generation = results[0]
    predicted = normalize_final_answer(get_unnormalized_answer(generation))
    score = 1 if is_equiv(predicted, doc["answer"]) else 0
    return {"exact_match": score}
def last_boxed_only_string(string: str) -> Optional[str]:
    """
    Return the last boxed expression found in ``string``, wrapper included.

    If the brace-less form (the command followed by a space) occurs anywhere,
    the text after its last occurrence up to the next "$" is returned with
    that prefix. Otherwise the last "\\boxed" (or, failing that, "\\fbox")
    is located and the span up to its matching closing brace is returned.
    Returns None when no marker exists or the braces never balance.
    """
    spaced_marker = "\\boxed "
    if spaced_marker in string:
        after_marker = string.rsplit(spaced_marker, 1)[-1]
        return spaced_marker + after_marker.partition("$")[0]

    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None

    # Walk forward from the marker, tracking brace depth. The first closing
    # brace that brings the depth back to zero ends the boxed expression.
    depth = 0
    for offset, char in enumerate(string[start:]):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return string[start : start + offset + 1]
    return None
def remove_boxed(s: str) -> str:
    """
    Strip the box wrapper from ``s`` and return the inner expression.

    Handles both the brace-less form (command followed by a space) and the
    braced form. Raises AssertionError when ``s`` does not start with the
    expected prefix or, for the braced form, does not end with "}".
    """
    spaced_prefix = "\\boxed "
    if spaced_prefix in s:
        assert s.startswith(spaced_prefix)
        return s[len(spaced_prefix) :]

    braced_prefix = "\\boxed{"
    assert s.startswith(braced_prefix)
    assert s.endswith("}")
    return s[len(braced_prefix) : -1]
class timeout:
    """
    Context manager that raises TimeoutError when its body runs too long.

    On entry a SIGALRM handler is installed and an alarm is scheduled for
    ``seconds`` seconds. On exit the alarm is cancelled and the handler that
    was active before entry is put back, so using this class no longer
    permanently replaces the process's SIGALRM handler.

    Relies on ``signal.SIGALRM`` / ``signal.alarm``, so it only works on
    platforms that provide them and, per the ``signal`` module's rules, only
    from the main thread.
    """

    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message
        # Handler that was installed before __enter__; restored in __exit__.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: abort the guarded body with TimeoutError."""
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # signal.signal returns the handler it replaces; remember it.
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Cancel the pending alarm first so it cannot fire during restoration.
        signal.alarm(0)
        # A previous handler of None means it was not installed from Python
        # and cannot be re-installed through signal.signal.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
def is_equiv(x1: str, x2: str) -> bool:
    """
    Decide whether two normalized LaTeX answers are mathematically equal.

    Both strings are parsed with sympy's ``parse_latex``; they are considered
    equivalent when the simplified difference of the parsed expressions
    equals 0. The whole comparison runs under a 5-second ``timeout``.

    Always returns a bool: any parsing, subtraction or simplification
    failure, a timeout, or any other exception is logged at debug level and
    reported as False.
    """
    try:
        with timeout(seconds=5):
            try:
                parsed_x1 = parse_latex(x1)
                parsed_x2 = parse_latex(x2)
            except (
                sympy.parsing.latex.errors.LaTeXParsingError,
                sympy.SympifyError,
                TypeError,
            ):
                eval_logger.debug(f"couldn't parse one of {x1} or {x2}")
                return False

            try:
                diff = parsed_x1 - parsed_x2
            except TypeError:
                eval_logger.debug(f"couldn't subtract {x1} and {x2}")
                return False

            try:
                return bool(sympy.simplify(diff) == 0)
            except ValueError:
                eval_logger.debug(
                    f"Had some trouble simplifying when comparing {x1} and {x2}"
                )
                # Previously this path fell through and returned None.
                return False
    except TimeoutError:
        eval_logger.debug(f"Timed out comparing {x1} and {x2}")
        return False
    except Exception as e:
        # Deliberate best-effort: an unexpected failure counts as a mismatch.
        eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}")
        return False
def get_unnormalized_answer(text: str) -> str:
    """
    Pull the raw final answer out of a generated solution.

    The closing phrase is appended to ``text`` before searching, so a
    generation that stops before that phrase can still match. Returns the
    stripped text captured between "The final answer is" and the closing
    phrase, or "[invalidanswer]" when the pattern is not found.
    """
    INVALID_ANSWER = "[invalidanswer]"
    closing_phrase = "I hope it is correct."
    padded_text = text + closing_phrase
    found = re.search(
        r"Final Answer: The final answer is(.*?). I hope it is correct.",
        padded_text,
    )
    if found is None:
        return INVALID_ANSWER
    return found.group(1).strip()
# Ordered (old, new) pairs that normalize_final_answer applies one after
# another with str.replace. Order matters: the article rules ("an ", "a ")
# run while spaces are still present, and the "\\text{and}" rules run after
# every plain space has already been removed.
SUBSTITUTIONS = [
    ("an ", ""),
    ("a ", ""),
    (".$", "$"),  # drop a period directly before a dollar sign
    ("\\$", ""),  # backslash followed by a dollar sign
    (r"\ ", ""),  # backslash followed by a space
    (" ", ""),  # every remaining plain space
    ("mbox", "text"),
    (",\\text{and}", ","),
    ("\\text{and}", ","),
    ("\\text{m}", "\\text{}"),
]
# Substrings that normalize_final_answer deletes outright (replaced by "")
# after SUBSTITUTIONS has been applied, in the order listed here.
# Matching is plain substring matching via str.replace, so an entry also
# fires when it occurs inside a longer token.
REMOVED_EXPRESSIONS = [
    "square",
    "ways",
    "integers",
    "dollars",
    "mph",
    "inches",
    "ft",
    "hours",
    "km",
    "units",
    "\\ldots",
    "sue",
    "points",
    "feet",
    "minutes",
    "digits",
    "cents",
    "degrees",
    "cm",
    "gm",
    "pounds",
    "meters",
    "meals",
    "edges",
    "students",
    "childrentickets",
    "multiples",
    "\\text{s}",
    "\\text{.}",
    # NOTE: in this non-raw literal "\n" is a newline character, not a
    # backslash followed by the letter n.
    "\\text{\ns}",
    "\\text{}^2",
    "\\text{}^3",
    # NOTE: same as above, "\n" here is a newline character.
    "\\text{\n}",
    "\\text{}",
    r"\mathrm{th}",
    r"^\circ",
    r"^{\circ}",
    r"\;",
    r",\!",
    "{,}",
    '"',
    "\\dots",
]
def normalize_final_answer(final_answer: str) -> str:
    """
    Normalize a final answer to a quantitative reasoning question.

    Follows the procedure from appendix D of Lewkowycz et al. (2022): keep
    the text after the last "=", apply SUBSTITUTIONS, delete every entry of
    REMOVED_EXPRESSIONS, unwrap LaTeX wrappers, expand shorthand TeX, drop
    dollar signs, and strip thousands separators from plain integers.
    """
    answer = final_answer.split("=")[-1]

    for old, new in SUBSTITUTIONS:
        answer = answer.replace(old, new)
    for unwanted in REMOVED_EXPRESSIONS:
        answer = answer.replace(unwanted, "")

    # Applied strictly in this order; each rule sees the previous rule's output.
    rewrite_rules = (
        # Extract answer that is in LaTeX math, is bold,
        # is surrounded by a box, etc.
        (r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$"),
        (r"(\\text\{)(.*?)(\})", "\\2"),
        (r"(\\textbf\{)(.*?)(\})", "\\2"),
        (r"(\\overline\{)(.*?)(\})", "\\2"),
        (r"(\\boxed\{)(.*)(\})", "\\2"),
        # Normalize shorthand TeX:
        #  \fracab -> \frac{a}{b}
        #  \frac{abc}{bef} -> \frac{abc}{bef}
        #  \fracabc -> \frac{a}{b}c
        #  \sqrta -> \sqrt{a}
        #  \sqrtab -> sqrt{a}b
        (r"(frac)([^{])(.)", "frac{\\2}{\\3}"),
        (r"(sqrt)([^{])", "sqrt{\\2}"),
    )
    for pattern, replacement in rewrite_rules:
        answer = re.sub(pattern, replacement, answer)

    answer = answer.replace("$", "")

    # Normalize 100,000 -> 100000
    without_commas = answer.replace(",", "")
    return without_commas if without_commas.isdigit() else answer
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment