Commit 6bc8b8cc authored by Baber
Browse files

add math500 pass@1

parent 80b2244e
...@@ -4,16 +4,25 @@ process_docs: !function utils.process_docs ...@@ -4,16 +4,25 @@ process_docs: !function utils.process_docs
output_type: generate_until output_type: generate_until
test_split: test test_split: test
doc_to_text: "Solve the following math problem efficiently and clearly:\n\n- For simple problems (2 steps or fewer):\nProvide a concise solution with minimal explanation.\n\n- For complex problems (3 steps or more):\nUse this step-by-step format:\n\n## Step 1: [Concise description]\n[Brief explanation and calculations]\n\n## Step 2: [Concise description]\n[Brief explanation and calculations]\n\n...\n\nRegardless of the approach, always conclude with:\n\nTherefore, the final answer is: $\\\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem.\n\nProblem: {{ problem }}" doc_to_text: "Solve the following math problem efficiently and clearly:\n\n- For simple problems (2 steps or fewer):\nProvide a concise solution with minimal explanation.\n\n- For complex problems (3 steps or more):\nUse this step-by-step format:\n\n## Step 1: [Concise description]\n[Brief explanation and calculations]\n\n## Step 2: [Concise description]\n[Brief explanation and calculations]\n\n...\n\nRegardless of the approach, always conclude with:\n\nTherefore, the final answer is: $\\\\boxed{answer}$. I hope it is correct.\n\nWhere [answer] is just the final number or expression that solves the problem.\n\nProblem: {{ problem }}"
process_results: !function utils.process_results #process_results: !function utils.process_results
doc_to_target: "{{answer if few_shot is undefined else solution}}" doc_to_target: "{{answer if few_shot is undefined else solution}}"
process_results: !function utils.process_results
repeats: 2 repeats: 2
generation_kwargs: generation_kwargs:
until: [] until: []
max_gen_toks: 5120 max_gen_toks: 1024
do_sample: true do_sample: true
top_p: 0.95
temperature: 0.6 temperature: 0.6
filter_list:
- name: "pass@1"
filter:
- function: "custom"
filter_fn: !function utils.filter_final_answer
- function: "custom"
filter_fn: !function utils.get_metric
metric_list: metric_list:
- metric: exact_match - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
num_fewshot: 0 num_fewshot: 0
...@@ -21,3 +30,4 @@ metadata: ...@@ -21,3 +30,4 @@ metadata:
version: 1.0 version: 1.0
dataset_kwargs: dataset_kwargs:
trust_remote_code: true trust_remote_code: true
...@@ -63,21 +63,39 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: ...@@ -63,21 +63,39 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
# ] # ]
# calculate pass@1 for all results def filter_final_answer(resps: list[list[str]], docs) -> list[list[str]]:
def process_results(doc: dict, results: List[str]) -> Dict[str, int]: answer = []
candidates = results[0] for resp in resps:
answer.append(
answer = normalize_final_answer(remove_boxed(last_boxed_only_string(candidates))) [
normalize_final_answer(remove_boxed(last_boxed_only_string(r[0])))
if is_equiv(answer, doc["answer"]): for r in resp
retval = 1 ]
else: )
retval = 0 return answer
def process_results(docs: dict, resps: list[dict]) -> dict:
return resps[0]
results = { # calculate pass@1 for all results
"exact_match": retval, def get_metric(predictions: list[list[str]], references: list[dict]) -> Dict[str, int]:
} res = []
return results for reference, candidates in zip(references, predictions):
for candidate in candidates:
answer = normalize_final_answer(
remove_boxed(last_boxed_only_string(candidate))
)
if is_equiv(answer, reference["answer"]):
retval = 1
results = {
"accuracy": retval,
}
res.append(results)
break
else:
res.append({"accuracy": 0})
return res
def last_boxed_only_string(string: str) -> Optional[str]: def last_boxed_only_string(string: str) -> Optional[str]:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment