allow repeats

80b2244e · Baber · f198094c · 80b2244e · 80b2244e
Commit 80b2244e authored Jan 21, 2025 by Baber
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 16 additions and 14 deletions

lm_eval/tasks/math500/math500.yaml lm_eval/tasks/math500/math500.yaml +3 -2

lm_eval/tasks/math500/utils.py lm_eval/tasks/math500/utils.py +13 -12

No files found.
--- a/lm_eval/tasks/math500/math500.yaml
+++ b/lm_eval/tasks/math500/math500.yaml
@@ -6,11 +6,12 @@ test_split: test
 doc_to_text: "Solve the following math problem efficiently and clearly:\n\n- For simple problems (2 steps or fewer):\nProvide a concise solution with minimal explanation.\n\n- For complex problems (3 steps or more):\nUse this step-by-step format:\n\n## Step 1: [Concise description]\n[Brief explanation and calculations]\n\n## Step 2: [Concise description]\n[Brief explanation and calculations]\n\n...\n\nRegardless of the approach, always conclude with:\n\nTherefore, the final answer is: $\\\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem.\n\nProblem: {{ problem }}"
 process_results: !function utils.process_results
 doc_to_target: "{{answer if few_shot is undefined else solution}}"
+repeats: 2
 generation_kwargs:
  until: []
  max_gen_toks: 5120
-  do_sample: false
-  temperature: 0
+  do_sample: true
+  temperature: 0.6
 metric_list:
  - metric: exact_match
    aggregation: mean

--- a/lm_eval/tasks/math500/utils.py
+++ b/lm_eval/tasks/math500/utils.py
@@ -63,6 +63,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 #     ]


+# calculate pass@1 for all results
 def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    candidates = results[0]

@@ -184,18 +185,18 @@ def is_equiv(x1: str, x2: str) -> bool:
        return False


-def get_unnormalized_answer(text: str) -> str:
-    INVALID_ANSWER = "[invalidanswer]"
-    end_seq = "I hope it is correct."
-    text += end_seq
-    match = re.search(
-        r"Final Answer: The final answer is(.*?). I hope it is correct.",
-        text,
-    )
-    if match:
-        return match.group(1).strip()
-    else:
-        return INVALID_ANSWER
+# def get_unnormalized_answer(text: str) -> str:
+#     INVALID_ANSWER = "[invalidanswer]"
+#     end_seq = "I hope it is correct."
+#     text += end_seq
+#     match = re.search(
+#         r"Final Answer: The final answer is(.*?). I hope it is correct.",
+#         text,
+#     )
+#     if match:
+#         return match.group(1).strip()
+#     else:
+#         return INVALID_ANSWER


 SUBSTITUTIONS = [