"test/vscode:/vscode.git/clone" did not exist on "286e6540a6efe9a838b2dee8f41e139a55084d77"
Unverified Commit 61ee8678 authored by Baber Abbasi's avatar Baber Abbasi Committed by GitHub
Browse files

[leaderboard] math - sync with repo (#2817)

* sync with leaderboard

* also output old metric

* wrap old extraction in try except

* better log
parent 3816796e
...@@ -16,9 +16,12 @@ metric_list: ...@@ -16,9 +16,12 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
- metric: exact_match_original
aggregation: mean
higher_is_better: true
num_fewshot: 4 num_fewshot: 4
metadata: metadata:
version: 2.0 version: 3.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
fewshot_config: fewshot_config:
......
import logging import logging
import re from typing import Dict, List
import signal
from typing import Dict, List, Optional
import datasets import datasets
eval_logger = logging.getLogger(__name__)
try: try:
import re
import signal
import sympy import sympy
from math_verify import LatexExtractionConfig, parse, verify
from sympy.parsing.latex import parse_latex from sympy.parsing.latex import parse_latex
except ModuleNotFoundError: except ModuleNotFoundError:
raise ModuleNotFoundError( raise ModuleNotFoundError(
"`sympy` is required for generating translation task prompt templates. \ "`math-verify`, `sympy>=1.12`, and antlr4-python3-runtime==4.11 is required for generating translation task prompt templates. \
please install sympy via pip install lm-eval[math] or pip install -e .[math]", please install via pip install lm-eval[math] or pip install -e .[math]",
) )
INVALID_ANSWER = "[invalidanswer]" INVALID_ANSWER = "[invalidanswer]"
...@@ -31,9 +32,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: ...@@ -31,9 +32,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
out_doc = { out_doc = {
"problem": doc["problem"], "problem": doc["problem"],
"solution": doc["solution"], "solution": doc["solution"],
"answer": normalize_final_answer( "answer": remove_boxed(last_boxed_only_string(doc["solution"])),
remove_boxed(last_boxed_only_string(doc["solution"]))
),
} }
if getattr(doc, "few_shot", None) is not None: if getattr(doc, "few_shot", None) is not None:
out_doc["few_shot"] = True out_doc["few_shot"] = True
...@@ -73,32 +72,47 @@ def list_fewshot_samples() -> list[dict]: ...@@ -73,32 +72,47 @@ def list_fewshot_samples() -> list[dict]:
def process_results(doc: dict, results: List[str]) -> Dict[str, int]: def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
candidates = results[0] candidates = results[0]
parsed_candidate = parse(candidates)
parsed_answer = parse(doc["solution"], extraction_config=[LatexExtractionConfig()])
if verify(parsed_answer, parsed_candidate):
retval = 1
else:
retval = 0
try:
original = process_result_v1(doc, candidates)
except: # noqa: E722
original = 0
output = {
"exact_match": retval,
"exact_match_original": original,
}
return output
def process_result_v1(doc: dict, candidates: str) -> int:
# using the original answer extraction method
unnormalized_answer = get_unnormalized_answer(candidates) unnormalized_answer = get_unnormalized_answer(candidates)
answer = normalize_final_answer(unnormalized_answer) answer = normalize_final_answer(unnormalized_answer)
normalized_gold = normalize_final_answer(doc["answer"])
if answer == INVALID_ANSWER: if answer == INVALID_ANSWER:
return {"exact_match": 0} return 0
if answer.strip() == normalized_gold.strip() or is_equiv(answer, normalized_gold):
if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]):
retval = 1 retval = 1
else: else:
retval = 0 retval = 0
return retval
results = {
"exact_match": retval,
}
return results
def last_boxed_only_string(string: str) -> Optional[str]: def last_boxed_only_string(string: str) -> str:
idx = string.rfind("\\boxed") idx = string.rfind("\\boxed")
if "\\boxed " in string: if "\\boxed " in string:
return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
if idx < 0: if idx < 0:
idx = string.rfind("\\fbox") idx = string.rfind("\\fbox")
if idx < 0: if idx < 0:
return None return INVALID_ANSWER
i = idx i = idx
right_brace_idx = None right_brace_idx = None
...@@ -114,7 +128,7 @@ def last_boxed_only_string(string: str) -> Optional[str]: ...@@ -114,7 +128,7 @@ def last_boxed_only_string(string: str) -> Optional[str]:
i += 1 i += 1
if right_brace_idx is None: if right_brace_idx is None:
retval = None retval = INVALID_ANSWER
else: else:
retval = string[idx : right_brace_idx + 1] retval = string[idx : right_brace_idx + 1]
...@@ -157,6 +171,7 @@ def is_equiv(x1: str, x2: str) -> bool: ...@@ -157,6 +171,7 @@ def is_equiv(x1: str, x2: str) -> bool:
""" """
x1 and x2 are normalized latex string x1 and x2 are normalized latex string
""" """
eval_logger = logging.getLogger(__name__)
try: try:
with timeout(seconds=1): with timeout(seconds=1):
try: try:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment