# Task configuration `gsm8k_judge_2`.
#
# The prompt shows a question, its reference answer and an attempted answer,
# and asks the model to reply "The answer is Correct" or
# "The answer is Incorrect". The reply is extracted with a regex filter and
# scored with exact_match against `doc_to_target`.
#
# Settings are layered on top of `gsm8k_judge_1.yaml` via `include`
# (presumably keys defined here override the included ones - confirm).

tag:
  - judge
include: gsm8k_judge_1.yaml
task: gsm8k_judge_2
output_type: generate_until
# Explicit null; no output path is configured in this file.
output_path: null

# Previous prompt, kept for reference:
# doc_to_text: "Question: {{question}}\nAnswer:"
#
# Literal block scalar so the line breaks are real newlines. The earlier
# single-quoted form kept `\n` as two literal characters (backslash + n),
# because YAML single quotes do not process escape sequences.
doc_to_text: |-
  Given the following question and reference answer, verify if the attempted answer is correct. If it is, return "The answer is Correct". If it is incorrect, return "The answer is Incorrect".
  Question: {{question}}
  Reference Answer: {{answer}}
  Answer Attempt: {{resp}}
target_delimiter: "\n"
# Previous target, kept for reference:
# " {{answer.split('### ')[-1].rstrip()}}"
doc_to_target: "The answer is Correct"

metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    # Patterns removed before comparison: a literal comma, a literal dollar
    # sign, everything up to and including "#### ", and a trailing period.
    regexes_to_ignore:
      - ","
      - "\\$"
      - "(?s).*#### "
      - "\\.$"

generation_kwargs:
  until:
    - '<|start_header_id|>user<|end_header_id|>'
    - "Question:"
    # NOTE(review): an empty stop string looks unintended - possibly a token
    # such as an end-of-sequence marker was stripped from this entry. Confirm
    # how the generation backend treats "" before relying on it.
    - ""
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0

repeats: 1
num_fewshot: 5

filter_list:
  - name: "test"
    filter:
      # NOTE(review): the capture group holds only "Correct"/"Incorrect",
      # while doc_to_target is the full sentence "The answer is Correct".
      # If this filter returns the captured group, exact_match cannot
      # succeed - confirm against the filter implementation.
      # NOTE(review): also confirm that the "regex" filter accepts the
      # ignore_punctuation / ignore_case options given here.
      - function: "regex"
        regex_pattern: "The answer is (Correct|Incorrect)"
        ignore_punctuation: true
        ignore_case: true
      - function: "take_first"

metadata:
  version: 3.0