# "Training Verifiers to Solve Math Word Problems"
# https://arxiv.org/abs/2110.14168
#
# State-of-the-art language models can match human performance on many tasks, but
# they still struggle to robustly perform multi-step mathematical reasoning. To
# diagnose the failures of current models and support research, we introduce GSM8K,
# a dataset of 8.5K high quality linguistically diverse grade school math word problems.
# We find that even the largest transformer models fail to achieve high test performance,
# despite the conceptual simplicity of this problem distribution.
#
# NOTE: See the official implementation of the task:
#    https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py
# for how to make use of the dataset's calculator annotations in your language
# model's sample/generation function.
#
# Homepage: https://github.com/openai/grade-school-math
#
# _CITATION = """
# @misc{cobbe2021training,
#       title={Training Verifiers to Solve Math Word Problems},
#       author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
#       year={2021},
#       eprint={2110.14168},
#       archivePrefix={arXiv},
#       primaryClass={cs.LG}
# }
# """

# Task identifier and the dataset (path + configuration name) it reads from.
task: gsm8k_yaml
dataset_path: gsm8k
dataset_name: main

# Dataset splits used for training examples and for evaluation.
training_split: train
test_split: test

# Template for the gold target: keeps only the text that follows the last
# '### ' in the document's `answer` field.
doc_to_target: "{{answer.split('### ')[-1]}}"

# Named prompt template to render each document with.
# NOTE(review): presumably resolved by the harness's prompt registry — confirm.
use_prompt: "qa-basic:question-newline-answer"

# Scoring: exact string match, averaged over documents; higher is better.
# The ignore_* flags are passed alongside the metric entry they belong to.
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true

# Double-quoted so YAML decodes the escape into a single newline character.
# NOTE(review): assumed to be a top-level task option rather than a metric
# argument — confirm against the harness's task config schema.
delimiter: "\n"

# Two named filter pipelines, each chaining two regex steps.
# Every step uses the match-everything pattern ".*".
# NOTE(review): these look like placeholders — confirm before relying on them.
filter_list:
  - name: "just regex"
    filter:
      - function: "regex"
        regex_pattern: ".*"
      - function: "regex"
        regex_pattern: ".*"
  - name: "another regex"
    filter:
      - function: "regex"
        regex_pattern: ".*"
      - function: "regex"
        regex_pattern: ".*"