add math500 pass@1

6bc8b8cc · Baber · 80b2244e · 6bc8b8cc · 6bc8b8cc
Commit 6bc8b8cc authored Jan 21, 2025 by Baber
Show whitespace changes
Inline Side-by-side

Showing with 45 additions and 17 deletions

lm_eval/tasks/math500/math500.yaml lm_eval/tasks/math500/math500.yaml +13 -3

lm_eval/tasks/math500/utils.py lm_eval/tasks/math500/utils.py +32 -14

No files found.
--- a/lm_eval/tasks/math500/math500.yaml
+++ b/lm_eval/tasks/math500/math500.yaml
@@ -4,16 +4,25 @@ process_docs: !function utils.process_docs
 output_type: generate_until
 test_split: test
 doc_to_text: "Solve the following math problem efficiently and clearly:\n\n- For simple problems (2 steps or fewer):\nProvide a concise solution with minimal explanation.\n\n- For complex problems (3 steps or more):\nUse this step-by-step format:\n\n## Step 1: [Concise description]\n[Brief explanation and calculations]\n\n## Step 2: [Concise description]\n[Brief explanation and calculations]\n\n...\n\nRegardless of the approach, always conclude with:\n\nTherefore, the final answer is: $\\\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem.\n\nProblem: {{ problem }}"
-process_results: !function utils.process_results
+#process_results: !function utils.process_results
 doc_to_target: "{{answer if few_shot is undefined else solution}}"
+process_results: !function utils.process_results
 repeats: 2
 generation_kwargs:
  until: []
-  max_gen_toks: 5120
+  max_gen_toks: 1024
  do_sample: true
+  top_p: 0.95
  temperature: 0.6
+filter_list:
+  - name: "pass@1"
+    filter:
+      - function: "custom"
+        filter_fn: !function utils.filter_final_answer
+      - function: "custom"
+        filter_fn: !function utils.get_metric
 metric_list:
-  - metric: exact_match
+  - metric: acc
    aggregation: mean
    higher_is_better: true
 num_fewshot: 0
@@ -21,3 +30,4 @@ metadata:
  version: 1.0
 dataset_kwargs:
  trust_remote_code: true
--- a/lm_eval/tasks/math500/utils.py
+++ b/lm_eval/tasks/math500/utils.py
@@ -63,21 +63,39 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 #     ]
-# calculate pass@1 for all results
+def filter_final_answer(resps: list[list[str]], docs) -> list[list[str]]:
-def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
+    answer = []
-    candidates = results[0]
+    for resp in resps:
+        answer.append(
+            [
+                normalize_final_answer(remove_boxed(last_boxed_only_string(r[0])))
+                for r in resp
+            ]
+        )
+    return answer
-    answer = normalize_final_answer(remove_boxed(last_boxed_only_string(candidates)))
+def process_results(docs: dict, resps: list[dict]) -> dict:
+    return resps[0]
-    if is_equiv(answer, doc["answer"]):
+# Score each doc: 1 if any sampled candidate is equivalent to the reference answer, else 0
+def get_metric(predictions: list[list[str]], references: list[dict]) -> Dict[str, int]:
+    res = []
+    for reference, candidates in zip(references, predictions):
+        for candidate in candidates:
+            answer = normalize_final_answer(
+                remove_boxed(last_boxed_only_string(candidate))
+            )
+            if is_equiv(answer, reference["answer"]):
                retval = 1
-    else:
-        retval = 0
                results = {
-        "exact_match": retval,
+                    "accuracy": retval,
                }
-    return results
+                res.append(results)
+                break
+        else:
+            res.append({"accuracy": 0})
+    return res
 def last_boxed_only_string(string: str) -> Optional[str]: