Unverified commit 358adaf7, authored by Baber Abbasi, committed by GitHub
Browse files

add math_verify to some tasks (#2686)

* add math_verify to minerva math

* add math_verify to benchmark

* fix error

* increment version
parent 52df63b7
...@@ -11,5 +11,8 @@ aggregate_metric_list: ...@@ -11,5 +11,8 @@ aggregate_metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
weight_by_size: true weight_by_size: true
- metric: math_verify
aggregation: mean
weight_by_size: true
metadata: metadata:
version: 1.0 version: 1.0
...@@ -63,3 +63,6 @@ If other tasks on this dataset are already supported: ...@@ -63,3 +63,6 @@ If other tasks on this dataset are already supported:
### Variant Wishlist ### Variant Wishlist
- [ ] zero-shot variant - [ ] zero-shot variant
### Changelog
version 2.0 (21-Feb-2025): added the math_verify (extraction) metric. For details, see the [Math-Verify leaderboard blog post](https://huggingface.co/blog/math_verify_leaderboard).
...@@ -19,9 +19,12 @@ metric_list: ...@@ -19,9 +19,12 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
- metric: math_verify
aggregation: mean
higher_is_better: true
num_fewshot: 4 num_fewshot: 4
metadata: metadata:
version: 1.0 version: 2.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
fewshot_config: fewshot_config:
......
import re import re
import signal import signal
from importlib.metadata import version
from typing import Dict, List, Optional from typing import Dict, List, Optional
import datasets import datasets
...@@ -8,13 +9,17 @@ from lm_eval.utils import eval_logger ...@@ -8,13 +9,17 @@ from lm_eval.utils import eval_logger
try: try:
import antlr4
import sympy import sympy
from math_verify import parse, verify
from sympy.parsing.latex import parse_latex from sympy.parsing.latex import parse_latex
except ModuleNotFoundError:
raise ModuleNotFoundError( assert version("antlr4-python3-runtime").startswith("4.11")
"`sympy` is required for generating translation task prompt templates. \ except (ModuleNotFoundError, AssertionError) as e:
please install sympy via pip install lm-eval[math] or pip install -e .[math]", raise type(e)(
) "`sympy`, `math_verify` and `antlr4-python3-runtime==4.11` are required for generating translation task prompt templates. "
"Please install the required packages via pip install lm-eval[math] or pip install -e .[math]"
) from e
# taken from # taken from
...@@ -75,8 +80,13 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]: ...@@ -75,8 +80,13 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
else: else:
retval = 0 retval = 0
# math_verify
res = verify(parse(doc["answer"]), parse(candidates))
mathval = 1 if res else 0
results = { results = {
"exact_match": retval, "exact_match": retval,
"math_verify": mathval,
} }
return results return results
......
...@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"] ...@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"] ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
neuronx = ["optimum[neuronx]"] neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"] mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"] math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
optimum = ["optimum[openvino]"] optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"] promptsource = ["promptsource>=0.2.3"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment