"""Dataset loaders that turn several math datasets into chat-style prompts.

Each ``get_*`` function loads a Hugging Face dataset, maps every row to a
``prompt`` (system + user messages) and an ``answer`` column, and drops rows
whose answer is ``None``.
"""
import re

from datasets import load_dataset, Dataset

# NOTE(review): the two "..." lines look like placeholders whose surrounding
# markup may have been lost when this file was extracted -- confirm against
# the intended response format before training with this prompt.
SYSTEM_PROMPT = """
Respond in the following format:
...
...
"""

# Rows of the DeepSeek-R1 distill dataset are only kept when their
# ``repo_name`` contains one of these names as a whole word (case-insensitive).
_MATH_REPO_NAMES = ["applied_math", "Advanced-Math", "GSM8K_zh", 'EduChat-Math']
_MATH_REPO_PATTERN = re.compile(
    r'\b(' + '|'.join(map(re.escape, _MATH_REPO_NAMES)) + r')\b',
    flags=re.IGNORECASE,
)

_BOXED_MARKER = "\\boxed{"


def _build_prompt(question: str) -> list[dict[str, str]]:
    """Return the system + user message list used as the ``prompt`` column."""
    return [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': question},
    ]


def _has_answer(row) -> bool:
    """Return True when the row's ``answer`` column is not ``None``."""
    return row['answer'] is not None


def _first_boxed_content(text: str) -> str | None:
    r"""Return the content of the first ``\boxed{...}`` in *text*.

    Braces are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned whole and any text after the closing
    brace is ignored. Returns ``None`` when there is no ``\boxed{`` or its
    braces never balance.
    """
    start = text.find(_BOXED_MARKER)
    if start == -1:
        return None
    begin = start + len(_BOXED_MARKER)
    depth = 1
    for pos, char in enumerate(text[begin:], start=begin):
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return text[begin:pos]
    return None


def extract_hash_answer(text: str) -> str | None:
    """Return the answer that follows the first ``####`` marker in *text*.

    The result is stripped of surrounding whitespace and has ``,`` and ``$``
    removed. Returns ``None`` when *text* contains no ``####``.
    """
    if "####" not in text:
        return None
    return text.split("####")[1].strip().replace(",", "").replace("$", "")


def extract_deepseek_r1_answer(text) -> str | None:
    r"""Return the ``\boxed{...}`` answer of a DeepSeek-R1 distill row.

    Args:
        text: A dataset row (mapping) with at least the keys ``repo_name``
            and ``output``.

    Returns:
        The content of the first ``\boxed{...}`` in ``text['output']`` when
        ``text['repo_name']`` names one of the math repositories; otherwise
        ``None``. ``None`` is also returned when either field is empty or
        missing a value.
    """
    repo_name = text['repo_name']
    if not repo_name or not _MATH_REPO_PATTERN.search(repo_name):
        return None
    output = text['output']
    if not output:
        return None
    # A greedy ``\\boxed\{(.*)\}`` regex would run to the *last* "}" on the
    # line (e.g. "\boxed{5} and \boxed{6}" -> "5} and \boxed{6"), so the
    # braces are balanced explicitly instead.
    return _first_boxed_content(output)


def get_gsm8k_questions(dataset='openai/gsm8k', split="train") -> Dataset:
    """Load the ``main`` config of *dataset* and build prompt/answer rows.

    The prompt comes from the ``question`` column and the answer is extracted
    from the ``answer`` column with :func:`extract_hash_answer`. Rows without
    an extractable answer are dropped.
    """
    data = load_dataset(dataset, 'main')[split]  # type: ignore
    data = data.map(
        lambda x: {
            'prompt': _build_prompt(x['question']),
            'answer': extract_hash_answer(x['answer']),
        },
        num_proc=16,
        remove_columns=["question"],
    )  # type: ignore
    data = data.filter(_has_answer, num_proc=16)
    return data  # type: ignore


def get_deepseek_r1_questions(dataset='Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT', split="train") -> Dataset:
    """Load *dataset* and keep only math rows that carry a boxed answer.

    The prompt comes from the ``instruction`` column and the answer from
    :func:`extract_deepseek_r1_answer`. Rows for which no answer is extracted
    are dropped, and the number of remaining rows is printed.
    """
    data = load_dataset(dataset)[split]  # type: ignore
    data = data.map(
        lambda x: {
            'prompt': _build_prompt(x['instruction']),
            'answer': extract_deepseek_r1_answer(x),
        },
        num_proc=16,  # type: ignore
        remove_columns=[
            "instruction",
            "output",
            "repo_name",
            "prompt_tokens_len",
            "input",
            "reasoning_content_tokens_len",
            "score",
            "content_tokens_len",
        ],
    )
    data = data.filter(_has_answer, num_proc=32)  # type: ignore
    print(f"GET {len(data)} data in Chinese-DeepSeek-R1-Distill-data-110k-SFT")
    return data  # type: ignore


def get_hiyoga(dataset='hiyouga/math12k', split='train') -> Dataset:
    """Load *dataset* and build prompt/answer rows.

    The prompt comes from the ``problem`` column; the existing ``answer``
    column is used as-is. Rows whose answer is ``None`` are dropped.
    """
    data = load_dataset(dataset)[split]  # type: ignore
    data = data.map(
        lambda x: {
            'prompt': _build_prompt(x['problem']),
            'answer': x['answer'],
        },
        remove_columns=["problem"],
        num_proc=16,
    )
    data = data.filter(_has_answer, num_proc=16)
    return data  # type: ignore


def get_unsloth_openmath(dataset="unsloth/OpenMathReasoning-mini", split='cot') -> Dataset:
    """Load *dataset* and build prompt/answer rows.

    The prompt comes from the ``problem`` column and the answer from the
    ``expected_answer`` column. Rows whose answer is ``None`` are dropped.
    """
    data = load_dataset(dataset)[split]
    data = data.map(
        lambda x: {
            'prompt': _build_prompt(x['problem']),
            'answer': x['expected_answer'],
        },
        remove_columns=[
            "expected_answer",
            "problem_type",
            "problem_source",
            "generation_model",
            "pass_rate_72b_tir",
            "generated_solution",
            "inference_mode",
            "problem",
        ],
        num_proc=16,
    )
    data = data.filter(_has_answer, num_proc=16)
    return data  # type: ignore


def get_openr1_dapo_math(dataset="open-r1/DAPO-Math-17k-Processed", split="train") -> Dataset:
    """Load the ``all`` config of *dataset* and build prompt/answer rows.

    The dataset's own ``prompt`` column is used as the user message (and is
    replaced by the message list); the answer comes from the ``solution``
    column. Rows whose answer is ``None`` are dropped.
    """
    data = load_dataset(dataset, "all")[split]
    data = data.map(
        lambda x: {
            'prompt': _build_prompt(x['prompt']),
            'answer': x['solution'],
        },
        remove_columns=[
            "solution",
            "data_source",
            "source_prompt",
            "ability",
            "reward_model",
            "extra_info",
        ],
        num_proc=16,
    )
    data = data.filter(_has_answer, num_proc=16)
    return data  # type: ignore