#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert a perf.json result file and a test.jsonl dataset into per-level JSONL files.

The script reads ``generated_texts`` and ``dataset_metadata`` from perf.json,
joins each generated text to its dataset entry through ``unique_id``, and
writes one JSONL file per difficulty level (1-5).
"""

import argparse
import json
import os
import re
import sys
from collections import defaultdict

# Model name stamped into every output record.
DEFAULT_MODEL = "qwen3-8B"

# Difficulty levels that get their own output file.
_LEVELS = (1, 2, 3, 4, 5)

# File name template used when --output designates a directory; it matches the
# name documented in the --output help text.
_DEFAULT_OUTPUT_NAME = "math_500_Level_{level}.jsonl"

# NOTE(review): the original pattern was r'(.*?)', which only ever matches the
# empty string, so the reasoning part was always empty. The tag names below
# assume the model emits Qwen3-style <think>...</think> blocks -- confirm
# against real perf.json content.
_THINK_BLOCK_RE = re.compile(r'<think>(.*?)</think>', re.DOTALL)
_THINK_CLOSE = '</think>'


def extract_reasoning_and_text(generated_text: str):
    """Split *generated_text* into its reasoning part and its answer part.

    Args:
        generated_text: Raw model output, optionally containing a
            ``<think>...</think>`` block.

    Returns:
        A ``(reasoning, text)`` tuple of stripped strings. ``reasoning`` is
        the content of the first think block, or ``""`` when there is none;
        ``text`` is the input with every think block removed.
    """
    match = _THINK_BLOCK_RE.search(generated_text)
    if match:
        reasoning = match.group(1).strip()
        text_part = _THINK_BLOCK_RE.sub('', generated_text).strip()
        return reasoning, text_part

    # A closing tag without an opening one: treat everything before it as
    # reasoning (presumably the opening tag was part of the prompt).
    head, sep, tail = generated_text.partition(_THINK_CLOSE)
    if sep:
        return head.strip(), tail.strip()

    return "", generated_text.strip()


def build_output_item(seq_idx: int, unique_id: str, solution: str,
                      generated_text: str, model: str = DEFAULT_MODEL):
    """Build the output JSON object for a single dataset entry.

    Args:
        seq_idx: Sequence number of the entry within its level; stored as
            ``index``.
        unique_id: Dataset identifier; stored as ``metadata.question_id``.
        solution: Reference solution; stored as ``metadata.solution``.
        generated_text: Raw model output, split into reasoning and answer.
        model: Model name written into the record.

    Returns:
        A JSON-serialisable dict with the keys ``index``, ``model``,
        ``model_output``, ``messages`` and ``metadata``.
    """
    reasoning, answer_text = extract_reasoning_and_text(generated_text)

    # The same content list is referenced by both the choice message and the
    # top-level message, exactly as in the serialised output.
    content = [
        {
            "internal": None,
            "type": "reasoning",
            "reasoning": reasoning,
            "signature": None,
            "redacted": False,
        },
        {
            "internal": None,
            "type": "text",
            "text": answer_text,
            "refusal": None,
        },
    ]

    def assistant_message():
        # Fresh dict per call so the two messages are independent objects.
        return {
            "content": content,
            "source": "generate",
            "metadata": None,
            "internal": None,
            "role": "assistant",
            "tool_calls": None,
            "model": model,
        }

    model_output = {
        "model": model,
        "choices": [
            {
                "message": assistant_message(),
                "stop_reason": "stop",
                "logprobs": None,
            }
        ],
        # NOTE: fixed placeholder token counts, not measured per entry.
        "usage": {
            "input_tokens": 109,
            "output_tokens": 6708,
            "total_tokens": 6817,
            "input_tokens_cache_write": None,
            "input_tokens_cache_read": None,
            "reasoning_tokens": None,
        },
        "time": None,
        "metadata": None,
        "error": None,
    }

    return {
        "index": seq_idx,  # sequence number within the level
        "model": model,
        "model_output": model_output,
        "messages": [assistant_message()],
        "metadata": {
            "question_id": unique_id,
            "solution": solution,
        },
    }


def _load_id_map(test_file: str) -> dict:
    """Map each unique_id in test.jsonl to ``(level, seq_in_level, solution)``."""
    id_map = {}
    # Next sequence number (starting at 0) for each level.
    level_counter = {level: 0 for level in _LEVELS}
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            unique_id = obj.get('unique_id')
            level = obj.get('level')
            # Entries without an id or with an unexpected level are ignored.
            if unique_id is None or level not in _LEVELS:
                continue
            id_map[unique_id] = (level, level_counter[level], obj.get('solution', ''))
            level_counter[level] += 1
    return id_map


def _resolve_output_pattern(output: str) -> str:
    """Return a path template containing ``{level}`` for the --output value.

    A value without ``{level}`` is treated as a directory and the default
    file name is appended to it.
    """
    if '{level}' in output:
        return output
    return os.path.join(output, _DEFAULT_OUTPUT_NAME)


def main():
    """Parse the command line and write one JSONL file per level."""
    parser = argparse.ArgumentParser(
        description="将 perf.json 和 test.jsonl 转换为按 level 分组的 jsonl 文件"
    )
    parser.add_argument(
        "--perf",
        required=True,
        help="输入的 perf.json 文件路径,生成的perf结果文件"
    )
    parser.add_argument(
        "--test",
        required=True,
        help="输入的 test.jsonl 文件路径,数据集路径"
    )
    parser.add_argument(
        "--output",
        default="math_500_Level_{level}.jsonl",
        help="输出文件路径模板(必须包含 {level})或输出目录。"
             "如果是目录,则会在该目录下生成 math_500_Level_{level}.jsonl 文件。"
             "例如:'output/level_{level}.jsonl' 或 'output_dir/'。"
             "默认:'math_500_Level_{level}.jsonl'"
    )
    args = parser.parse_args()

    # 1. Read test.jsonl and number the entries within each level.
    id_map = _load_id_map(args.test)

    # 2. Read perf.json.
    with open(args.perf, 'r', encoding='utf-8') as f:
        data = json.load(f)

    generated_texts = data.get('generated_texts', [])
    dataset_metadata = data.get('dataset_metadata', [])
    if len(generated_texts) != len(dataset_metadata):
        # zip() below stops at the shorter list, so surplus items are dropped.
        print("Warning: generated_texts and dataset_metadata lengths differ",
              file=sys.stderr)

    # 3. Group the output objects by level, indexed by their in-level sequence.
    level_outputs = defaultdict(list)
    for text, meta in zip(generated_texts, dataset_metadata):
        unique_id = meta.get('unique_id', '')
        if unique_id not in id_map:
            print(f"Warning: unique_id {unique_id} not found in test.jsonl, skipping",
                  file=sys.stderr)
            continue
        level, seq, solution = id_map[unique_id]
        level_outputs[level].append(
            build_output_item(seq, unique_id, solution, text)
        )

    # 4. Resolve the output path template (directory or explicit pattern).
    output_pattern = _resolve_output_pattern(args.output)

    # 5. Write one JSONL file per level; a level without items gets an empty file.
    for level in _LEVELS:
        items = sorted(level_outputs.get(level, []), key=lambda x: x['index'])
        out_file = output_pattern.format(level=level)

        # Create the parent directory on demand.
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(out_file, 'w', encoding='utf-8') as f:
            for obj in items:
                f.write(json.dumps(obj, ensure_ascii=False) + '\n')
        print(f"Written {len(items)} items to {out_file}")


if __name__ == "__main__":
    main()