Unverified Commit 61ee8678 authored by Baber Abbasi's avatar Baber Abbasi Committed by GitHub
Browse files

[leaderboard] math - sync with repo (#2817)

* sync with leaderboard

* also output old metric

* wrap old extraction in try except

* better log
parent 3816796e
......@@ -16,9 +16,12 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
- metric: exact_match_original
aggregation: mean
higher_is_better: true
num_fewshot: 4
metadata:
version: 2.0
version: 3.0
dataset_kwargs:
trust_remote_code: true
fewshot_config:
......
import logging
import re
import signal
from typing import Dict, List, Optional
from typing import Dict, List
import datasets
eval_logger = logging.getLogger(__name__)
try:
import re
import signal
import sympy
from math_verify import LatexExtractionConfig, parse, verify
from sympy.parsing.latex import parse_latex
except ModuleNotFoundError:
raise ModuleNotFoundError(
"`sympy` is required for generating translation task prompt templates. \
please install sympy via pip install lm-eval[math] or pip install -e .[math]",
"`math-verify`, `sympy>=1.12`, and antlr4-python3-runtime==4.11 is required for generating translation task prompt templates. \
please install via pip install lm-eval[math] or pip install -e .[math]",
)
INVALID_ANSWER = "[invalidanswer]"
......@@ -31,9 +32,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
out_doc = {
"problem": doc["problem"],
"solution": doc["solution"],
"answer": normalize_final_answer(
remove_boxed(last_boxed_only_string(doc["solution"]))
),
"answer": remove_boxed(last_boxed_only_string(doc["solution"])),
}
if getattr(doc, "few_shot", None) is not None:
out_doc["few_shot"] = True
......@@ -73,32 +72,47 @@ def list_fewshot_samples() -> list[dict]:
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
candidates = results[0]
parsed_candidate = parse(candidates)
parsed_answer = parse(doc["solution"], extraction_config=[LatexExtractionConfig()])
if verify(parsed_answer, parsed_candidate):
retval = 1
else:
retval = 0
try:
original = process_result_v1(doc, candidates)
except: # noqa: E722
original = 0
output = {
"exact_match": retval,
"exact_match_original": original,
}
return output
def process_result_v1(doc: dict, candidates: str) -> int:
# using the orginal answer extraction method
unnormalized_answer = get_unnormalized_answer(candidates)
answer = normalize_final_answer(unnormalized_answer)
normalized_gold = normalize_final_answer(doc["answer"])
if answer == INVALID_ANSWER:
return {"exact_match": 0}
if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
return 0
if answer.strip() == normalized_gold.strip() or is_equiv(answer, normalized_gold):
retval = 1
else:
retval = 0
results = {
"exact_match": retval,
}
return results
return retval
def last_boxed_only_string(string: str) -> Optional[str]:
def last_boxed_only_string(string: str) -> str:
idx = string.rfind("\\boxed")
if "\\boxed " in string:
return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
if idx < 0:
idx = string.rfind("\\fbox")
if idx < 0:
return None
return INVALID_ANSWER
i = idx
right_brace_idx = None
......@@ -114,7 +128,7 @@ def last_boxed_only_string(string: str) -> Optional[str]:
i += 1
if right_brace_idx is None:
retval = None
retval = INVALID_ANSWER
else:
retval = string[idx : right_brace_idx + 1]
......@@ -157,6 +171,7 @@ def is_equiv(x1: str, x2: str) -> bool:
"""
x1 and x2 are normalized latex string
"""
eval_logger = logging.getLogger(__name__)
try:
with timeout(seconds=1):
try:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment