# Task configuration for `arena_hard`: each prompt shows one user question
# followed by two candidate answers (Assistant A and Assistant B), and the
# generated text is post-processed by the filter pipeline defined below.
# NOTE(review): several commented-out keys reference gsm8k, so this file looks
# adapted from a gsm8k template - confirm that the remaining inherited values
# (tag, stop strings, num_fewshot) are still intended.

# NOTE(review): `math_word_problems` looks inherited from the gsm8k template;
# confirm it is the right tag for this task.
tag:
  - math_word_problems
task: arena_hard
# dataset_path: gsm8k
# dataset_name: main
output_type: generate_until
# training_split: train
# fewshot_split: train

# Prompt layout: the user prompt, then answer A and answer B, each wrapped in
# start/end markers.
# NOTE(review): the placeholders use single braces ({question_1}), not Jinja
# double braces, and the value is a one-element list - verify this matches
# what the consuming code expects.
doc_to_text: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]

# TODO: need a different metric
# doc_to_target: A, B #" {{answer.split('### ')[-1].rstrip()}}"
metric_list:
  - metric: bypass
    aggregation: mean

generation_kwargs:
  until:
    - "Question:"
    # NOTE(review): empty stop string - possibly a stop token that was lost
    # (e.g. stripped as markup); confirm the intended value.
    - ""
    - "<|im_end|>"
  do_sample: false
  temperature: 0.0

filter_list:
  - name: "test"
    filter:
      - function: "regex"
        # Captures the run of A / B / < / > / = characters enclosed in double
        # square brackets, e.g. the `A>B` in `[[A>B]]`.
        # The literal brackets must be backslash-escaped; left unescaped,
        # `[[([AB<>=]` reads as a single character class and the following
        # `)` is unbalanced. Single quotes keep the backslashes literal,
        # because `\[` is not a valid escape in a double-quoted YAML scalar.
        regex_pattern: '\[\[([AB<>=]+)\]\]'
      - function: "take_first"

repeats: 1
# NOTE(review): few-shot examples are requested while `fewshot_split` is
# commented out above - confirm where the examples come from.
num_fewshot: 5