Commit 2a7c435f authored by sunzhq2

init

parent 59a0ec90
# llm-benchmarks

## MATH-500 dataset
- https://www.modelscope.cn/datasets/AI-ModelScope/MATH-500
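- The dataset is not vendored in this repo. One way to fetch it locally (an assumption, not part of the original setup; the target directory `./MATH-500` matches the paths used by the scripts below, and git-lfs is only needed if the repo contains large files):
```
git lfs install
git clone https://www.modelscope.cn/datasets/AI-ModelScope/MATH-500.git ./MATH-500
```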
## vLLM script modifications
- Replace serve.py:
```
mv /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/serve.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/serve.py.bak
cp ./utils/vllm-benchmarks/serve.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/
```
- Replace datasets.py:
```
mv /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/datasets.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/datasets.py.bak
cp ./utils/vllm-benchmarks/datasets.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/
```
## evalscope modifications
- Replace evaluator.py:
```
cp /usr/local/lib/python3.10/dist-packages/evalscope/evaluator/evaluator.py /usr/local/lib/python3.10/dist-packages/evalscope/evaluator/evaluator.py.bak
cp ./utils/evalscope/evaluator.py /usr/local/lib/python3.10/dist-packages/evalscope/evaluator/
```
## Start the vLLM server
- bash vllm_serve.sh (a sketch of the underlying command is shown below)
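- The actual vllm_serve.sh is part of this repo and is not reproduced here; a minimal sketch of the kind of command it likely wraps, assuming the model path, served model name, and port used by run_performance_test.sh:
```
vllm serve /data2/models/qwen3-8B \
    --served-model-name qwen3-8B \
    --host 0.0.0.0 \
    --port 8000
```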
## Save performance and accuracy results
- bash run_benchmarks.sh (a sketch of what it does is shown below)
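- run_benchmarks.sh is included in the repo and not shown here; a hedged sketch of what it likely chains together, based on the two scripts in this commit (the accuracy-script filename is illustrative, not the real one):
```
#!/bin/bash
set -e
bash run_performance_test.sh        # load test; saves detailed generations under ./results/
python eval_math500_accuracy.py     # hypothetical name: offline accuracy check over the saved perf JSON
```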
## Dataset conversion
```
cd tools
bash run_convert.sh
```
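- run_convert.sh wraps the conversion script included in this commit (the one exposing `--perf` / `--test` / `--output`). A direct invocation would look roughly like the following; the script name convert_perf_to_evalscope.py is illustrative, and the output directory mirrors the predictions path that evalscope later reads:
```
python convert_perf_to_evalscope.py \
    --perf ../results/performance_results/qwen3_8b_math500_perf.json \
    --test ../MATH-500/test.jsonl \
    --output ./evalscope-data/predictions/qwen3-8B/
```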
## Run the accuracy evaluation
```
cd tools
bash evalscope_test.sh
```
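- evalscope_test.sh is provided in the repo. Judging from the task config dumped later in this commit (use_cache and work_dir both pointing at tools/evalscope-data, rerun_review enabled), it likely re-scores the cached predictions with something along these lines; the exact flags depend on the installed evalscope version, so treat this as a sketch rather than the real script:
```
evalscope eval \
    --model qwen3-8B \
    --datasets math_500 \
    --use-cache ./evalscope-data \
    --work-dir ./evalscope-data
```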
#!/bin/bash
# run_performance_test.sh - run the performance benchmark and save detailed model outputs
set -e
if [ ! -d "./results" ]; then
    mkdir ./results
fi
rm -rf ./results/*
# ========== Configuration ==========
BASE_URL="http://0.0.0.0:8000"
MODEL_NAME="qwen3-8B"
LOCAL_MODEL_PATH="/data2/models/qwen3-8B"
DATASET_JSONL="./MATH-500/test.jsonl"
PROMPTS_FILE="./results/math500_prompts.jsonl"
RESULT_DIR="./results/performance_results"
RESULT_FILENAME="qwen3_8b_math500_perf.json"
REQUEST_RATE=10
# ===========================
# 1. Check that the vLLM service is reachable
echo ">>> Checking vLLM server status..."
if ! curl -s "${BASE_URL}/health" > /dev/null; then
    echo ">>> Error: cannot connect to ${BASE_URL}"
    exit 1
fi
echo ">>> Service is up."
# 2. Generate the JSONL prompt file (sampling_params can be added per record if needed)
if [ -f "$PROMPTS_FILE" ]; then
    echo ">>> Old prompt file found; deleting and regenerating."
    rm -f "$PROMPTS_FILE"
fi
echo ">>> Generating JSONL prompt file from $DATASET_JSONL ..."
python - "$DATASET_JSONL" "$PROMPTS_FILE" << 'EOF'
import sys, json
with open(sys.argv[1], 'r', encoding='utf-8') as f_in, \
     open(sys.argv[2], 'w', encoding='utf-8') as f_out:
    for line in f_in:
        item = json.loads(line)
        prompt_text = f"{item['problem']}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}."
        record = {
            "prompt": prompt_text,
            "metadata": {"unique_id": item["unique_id"]}
        }
        f_out.write(json.dumps(record, ensure_ascii=False) + '\n')
EOF
# "sampling_params": {"max_tokens": 8192},
echo ">>> JSONL 提示词文件已保存至 $PROMPTS_FILE"
# 3. 执行性能测试(带详细保存)
echo ">>> 开始性能压测并保存详细结果..."
vllm bench serve \
--backend vllm \
--base-url "$BASE_URL" \
--model "$MODEL_NAME" \
--tokenizer "$LOCAL_MODEL_PATH" \
--dataset-name custom \
--dataset-path "$PROMPTS_FILE" \
--num-prompts 500 \
--request-rate "$REQUEST_RATE" \
--save-result \
--save-detailed \
--result-dir "$RESULT_DIR" \
--result-filename "$RESULT_FILENAME" \
--custom-output-len 32768 \
--metadata \
--max-concurrency 256 \
--temperature 0.0
# --top_p 0.6
echo ">>> 性能测试完成!结果目录: $RESULT_DIR"
echo ">>> 汇总性能指标: $RESULT_DIR/$RESULT_FILENAME"
import json
import re
from typing import Dict, Any

WRONG_IDS_FILE = "wrong_ids.txt"  # file that collects the unique_id of every incorrectly answered sample

def clean_think_tags(text: str) -> str:
    """
    Strip reasoning markers the model may emit: ``\\final ... \\final`` spans,
    ``\\think ... \\think`` spans, and trailing ``\\instant`` markers.
    """
    # Remove \final ... \final spans (non-greedy)
    text = re.sub(r'\\final.*?\\final', '', text, flags=re.DOTALL)
    # Remove \think ... \think spans, if present
    text = re.sub(r'\\think.*?\\think', '', text, flags=re.DOTALL)
    # Remove \instant markers up to the end of the line
    text = re.sub(r'\\instant.*?(?=\n|$)', '', text, flags=re.DOTALL)
    return text.strip()

def extract_boxed_answer(text: str) -> str:
    """
    Extract the content of the last \\boxed{...} in the text, with robust
    handling of nested braces.
    """
    # Find every occurrence of \boxed{
    pattern = r'\\boxed\{'
    matches = list(re.finditer(pattern, text))
    if not matches:
        # No \boxed{} found: fall back to the last non-empty line
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        return lines[-1] if lines else text
    # Start from the last \boxed{
    last_match = matches[-1]
    start = last_match.end() - 1  # index of the opening '{'
    # Match braces with a stack
    stack = []
    i = start
    while i < len(text):
        if text[i] == '{':
            stack.append('{')
        elif text[i] == '}':
            stack.pop()
            if not stack:
                # Found the matching closing brace
                content = text[start+1:i]
                return content.strip()
        i += 1
    # Braces are unbalanced: fall back to a simple regex
    simple_match = re.search(r'\\boxed\{([^}]*(?:\{[^}]*\}[^}]*)*)\}', text)
    if simple_match:
        return simple_match.group(1).strip()
    return text.strip()

def normalize_math_answer(answer: str) -> str:
    """
    Normalize a math answer by removing LaTeX, whitespace, and bracket
    formatting differences.
    Note: this works well for fractions and simple expressions, but may not
    cover every symbolic answer.
    """
    # Remove whitespace
    normalized = re.sub(r'\s+', '', answer)
    # Drop \left and \right
    normalized = re.sub(r'\\left|\\right', '', normalized)
    # Drop LaTeX grouping braces. This changes the structure, but for
    # comparison purposes \frac{13}{15} and \frac1315 become equivalent.
    normalized = normalized.replace('{', '').replace('}', '')
    # Unify bracket styles
    normalized = normalized.replace('[', '(').replace(']', ')')
    # Drop \displaystyle
    normalized = re.sub(r'\\displaystyle', '', normalized)
    # Strip a trailing period
    normalized = normalized.rstrip('.')
    return normalized.strip()

def compare_answers(predicted: str, ground_truth: str) -> bool:
    pred_norm = normalize_math_answer(predicted)
    truth_norm = normalize_math_answer(ground_truth)
    # Exact match after normalization
    if pred_norm == truth_norm:
        return True
    # Fall back to numeric comparison when both sides are numbers or simple expressions
    try:
        # Restricted eval for basic arithmetic only
        pred_val = eval(pred_norm.replace('^', '**'), {"__builtins__": None}, {})
        truth_val = eval(truth_norm.replace('^', '**'), {"__builtins__": None}, {})
        return abs(pred_val - truth_val) < 1e-9
    except Exception:
        pass
    return False

def load_ground_truth_jsonl(filepath: str) -> Dict[str, Dict[str, str]]:
    data_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                item = json.loads(line)
                unique_id = item.get('unique_id')
                if unique_id:
                    data_dict[unique_id] = item
                else:
                    print(f"Warning: sample in dataset is missing unique_id: {item.get('problem', '')[:50]}...")
    return data_dict

def main():
    DATASET_JSONL = "/data1/sunzhq/llm-benchmark/MATH-500/test.jsonl"
    VLLM_RESULT_JSON = "/data1/sunzhq/llm-benchmark/results-1/performance_results/qwen3_8b_math500_perf.json"
    OUTPUT_EVAL_FILE = "evaluation_results.json"
    print(f"Loading dataset: {DATASET_JSONL}")
    ground_truth_dict = load_ground_truth_jsonl(DATASET_JSONL)
    print(f"Dataset index built, {len(ground_truth_dict)} unique samples in total.")
    print(f"Loading vLLM results: {VLLM_RESULT_JSON}")
    with open(VLLM_RESULT_JSON, 'r', encoding='utf-8') as f:
        vllm_data = json.load(f)
    generated_texts = vllm_data.get('generated_texts', [])
    dataset_metadata_list = vllm_data.get('dataset_metadata', [])
    if len(generated_texts) != len(dataset_metadata_list):
        print(f"Error: number of generated texts ({len(generated_texts)}) does not match number of metadata entries ({len(dataset_metadata_list)})!")
        return
    correct_count = 0
    eval_results = []
    wrong_ids = []
    for idx, (raw_output, meta) in enumerate(zip(generated_texts, dataset_metadata_list)):
        unique_id = meta.get('unique_id')
        if not unique_id:
            print(f"Warning: sample {idx} has no unique_id in its metadata. Skipping.")
            continue
        ground_truth_item = ground_truth_dict.get(unique_id)
        if not ground_truth_item:
            print(f"Warning: no sample with id {unique_id} found in the original dataset. Skipping.")
            continue
        ground_truth = ground_truth_item.get('answer', '')
        cleaned = clean_think_tags(raw_output)
        predicted = extract_boxed_answer(cleaned)
        is_correct = compare_answers(predicted, ground_truth)
        if is_correct:
            correct_count += 1
        else:
            wrong_ids.append(unique_id)
        eval_results.append({
            'index': idx,
            'unique_id': unique_id,
            'problem': ground_truth_item.get('problem', '')[:100] + '...',
            'predicted': predicted,
            'ground_truth': ground_truth,
            'correct': is_correct
        })
        if (idx + 1) % 10 == 0:
            print(f"Processed {idx+1} samples...")
    total_valid_samples = len(eval_results)
    accuracy = correct_count / total_valid_samples if total_valid_samples > 0 else 0.0
    print("\n" + "=" * 60)
    print("Evaluation finished!")
    print(f"Valid samples evaluated: {total_valid_samples}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    # Save detailed results
    output_data = {
        'summary': {
            'total_processed': total_valid_samples,
            'correct': correct_count,
            'accuracy': accuracy
        },
        'details': eval_results
    }
    with open(OUTPUT_EVAL_FILE, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)
    print(f"Detailed evaluation results saved to: {OUTPUT_EVAL_FILE}")
    # Save the ids of the wrong samples
    if wrong_ids:
        with open(WRONG_IDS_FILE, 'w', encoding='utf-8') as f:
            for uid in wrong_ids:
                f.write(uid + '\n')
        print(f"List of wrong unique_ids saved to: {WRONG_IDS_FILE} ({len(wrong_ids)} in total)")
    else:
        print("No wrong samples.")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import sys
import argparse
import os
from collections import defaultdict

def extract_reasoning_and_text(generated_text: str):
    """Split generated_text into the <think> reasoning part and the answer text."""
    pattern = re.compile(r'<think>(.*?)</think>', re.DOTALL)
    match = pattern.search(generated_text)
    if match:
        reasoning = match.group(1).strip()
        text_part = pattern.sub('', generated_text).strip()
    else:
        reasoning = ""
        text_part = generated_text.strip()
    return reasoning, text_part

def build_output_item(seq_idx: int, unique_id: str, solution: str, generated_text: str):
    """Build the output JSON object for one item; seq_idx is the item's sequence number within its Level."""
    reasoning, answer_text = extract_reasoning_and_text(generated_text)
    content = [
        {
            "internal": None,
            "type": "reasoning",
            "reasoning": reasoning,
            "signature": None,
            "redacted": False
        },
        {
            "internal": None,
            "type": "text",
            "text": answer_text,
            "refusal": None
        }
    ]
    model_output = {
        "model": "qwen3-8B",
        "choices": [
            {
                "message": {
                    "content": content,
                    "source": "generate",
                    "metadata": None,
                    "internal": None,
                    "role": "assistant",
                    "tool_calls": None,
                    "model": "qwen3-8B"
                },
                "stop_reason": "stop",
                "logprobs": None
            }
        ],
        "usage": {  # note: these fixed counts are written for every item
            "input_tokens": 109,
            "output_tokens": 6708,
            "total_tokens": 6817,
            "input_tokens_cache_write": None,
            "input_tokens_cache_read": None,
            "reasoning_tokens": None
        },
        "time": None,
        "metadata": None,
        "error": None
    }
    messages = [
        {
            "content": content,
            "source": "generate",
            "metadata": None,
            "internal": None,
            "role": "assistant",
            "tool_calls": None,
            "model": "qwen3-8B"
        }
    ]
    output = {
        "index": seq_idx,  # sequence number within this Level
        "model": "qwen3-8B",
        "model_output": model_output,
        "messages": messages,
        "metadata": {
            "question_id": unique_id,
            "solution": solution
        }
    }
    return output

def main():
    parser = argparse.ArgumentParser(
        description="Convert perf.json plus test.jsonl into per-level jsonl files"
    )
    parser.add_argument(
        "--perf",
        required=True,
        help="Path to the input perf.json file (the generated performance result file)"
    )
    parser.add_argument(
        "--test",
        required=True,
        help="Path to the input test.jsonl file (the dataset)"
    )
    parser.add_argument(
        "--output",
        default="math_500_Level_{level}.jsonl",
        help="Output path template (must contain {level}) or an output directory. "
             "If a directory is given, 'math_500_Level {level}.jsonl' files are created inside it. "
             "Examples: 'output/level_{level}.jsonl' or 'output_dir/'. "
             "Default: 'math_500_Level_{level}.jsonl'"
    )
    args = parser.parse_args()
    perf_file = args.perf
    test_file = args.test
    output_pattern = args.output
    # 1. Read test.jsonl and assign a sequence number to each item within its Level.
    #    Mapping: unique_id -> (level, seq_in_level, solution)
    id_map = {}
    level_counter = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}  # current sequence number per Level (0-based)
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            unique_id = obj.get('unique_id')
            solution = obj.get('solution', '')
            level = obj.get('level')
            if unique_id is None or level not in (1, 2, 3, 4, 5):
                continue
            seq = level_counter[level]
            id_map[unique_id] = (level, seq, solution)
            level_counter[level] += 1
    # 2. Read perf.json
    with open(perf_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    generated_texts = data.get('generated_texts', [])
    dataset_metadata = data.get('dataset_metadata', [])
    if len(generated_texts) != len(dataset_metadata):
        print("Warning: generated_texts and dataset_metadata lengths differ", file=sys.stderr)
    # 3. Group the output objects by level, using the within-level sequence number as the index
    level_outputs = defaultdict(list)
    for text, meta in zip(generated_texts, dataset_metadata):
        unique_id = meta.get('unique_id', '')
        if unique_id not in id_map:
            print(f"Warning: unique_id {unique_id} not found in test.jsonl, skipping", file=sys.stderr)
            continue
        level, seq, solution = id_map[unique_id]
        output_item = build_output_item(seq, unique_id, solution, text)
        level_outputs[level].append(output_item)
    # 4. Resolve the output path: if it does not contain {level}, treat it as a directory
    #    (created if missing) and use the default file name inside it.
    if '{level}' not in output_pattern:
        out_dir = output_pattern
        os.makedirs(out_dir, exist_ok=True)
        output_pattern = os.path.join(out_dir, "math_500_Level {level}.jsonl")
    # If it does contain {level}, it is used as-is; output directories are still created below.
    # 5. Write one jsonl file per level
    for level in range(1, 6):
        items = level_outputs.get(level, [])
        # Make sure items are sorted by their within-level sequence number
        items.sort(key=lambda x: x['index'])
        out_file = output_pattern.format(level=level)
        # Make sure the output directory exists
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(out_file, 'w', encoding='utf-8') as f:
            for obj in items:
                f.write(json.dumps(obj, ensure_ascii=False) + '\n')
        print(f"Written {len(items)} items to {out_file}")


if __name__ == "__main__":
    main()
analysis_report: false
api_url: null
chat_template: null
dataset_args:
  math_500:
    aggregation: mean
    data_statistics: null
    dataset_id: AI-ModelScope/MATH-500
    default_subset: default
    description: '
      ## Overview
      MATH-500 is a curated subset of 500 problems from the MATH benchmark, designed
      to evaluate the mathematical reasoning capabilities of language models. It covers
      five difficulty levels across various mathematical topics including algebra,
      geometry, number theory, and calculus.
      ## Task Description
      - **Task Type**: Mathematical Problem Solving
      - **Input**: Mathematical problem statement
      - **Output**: Step-by-step solution with final numerical answer
      - **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)
      ## Key Features
      - 500 carefully selected problems from the full MATH dataset
      - Five difficulty levels for fine-grained evaluation
      - Problems cover algebra, geometry, number theory, probability, and more
      - Each problem includes a reference solution
      - Designed for efficient yet comprehensive math evaluation
      ## Evaluation Notes
      - Default configuration uses **0-shot** evaluation
      - Answers should be formatted within `\boxed{}` for proper extraction
      - Numeric equivalence checking for answer comparison
      - Results can be broken down by difficulty level
      - Commonly used for math reasoning benchmarking due to manageable size
      '
    eval_split: test
    extra_params: {}
    few_shot_num: 0
    few_shot_prompt_template: null
    few_shot_random: false
    filters: null
    force_redownload: false
    metric_list:
    - acc:
        numeric: true
    name: math_500
    output_types:
    - generation
    paper_url: null
    pretty_name: MATH-500
    prompt_template: '{question}
      Please reason step by step, and put your final answer within \boxed{{}}.'
    query_template: null
    review_timeout: null
    sample_example: null
    sandbox_config: {}
    shuffle: false
    shuffle_choices: false
    subset_list:
    - Level 1
    - Level 2
    - Level 3
    - Level 4
    - Level 5
    system_prompt: null
    tags:
    - Math
    - Reasoning
    train_split: null
dataset_dir: /root/.cache/modelscope/hub/datasets
dataset_hub: modelscope
datasets:
- math_500
debug: false
enable_progress_tracker: false
eval_backend: Native
eval_batch_size: 1
eval_config: null
eval_type: mock_llm
evalscope_version: 1.5.2.post1
generation_config:
  batch_size: 1
ignore_errors: false
judge_model_args: {}
judge_strategy: auto
judge_worker_num: 1
limit: null
model: text_generation
model_args: {}
model_id: qwen3-8B
model_task: text_generation
no_timestamp: true
repeats: 1
rerun_review: true
sandbox_manager_config: {}
sandbox_type: docker
seed: 42
stream: null
timeout: null
use_cache: /data1/sunzhq/llm-benchmark/tools/evalscope-data
use_sandbox: false
work_dir: /data1/sunzhq/llm-benchmark/tools/evalscope-data
2026-04-14 09:31:50 - evalscope - INFO: Running with native backend
2026-04-14 09:31:50 - evalscope - INFO: Dump task config to /data1/sunzhq/llm-benchmark/tools/evalscope-data/configs/task_config.yaml
2026-04-14 09:31:50 - evalscope - INFO: {
"model": "text_generation",
"model_id": "qwen3-8B",
"model_args": {},
"model_task": "text_generation",
"chat_template": null,
"datasets": [
"math_500"
],
"dataset_args": {
"math_500": {
"name": "math_500",
"dataset_id": "AI-ModelScope/MATH-500",
"output_types": [
"generation"
],
"subset_list": [
"Level 1",
"Level 2",
"Level 3",
"Level 4",
"Level 5"
],
"default_subset": "default",
"few_shot_num": 0,
"few_shot_random": false,
"train_split": null,
"eval_split": "test",
"prompt_template": "{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
"few_shot_prompt_template": null,
"system_prompt": null,
"query_template": null,
"pretty_name": "MATH-500",
"description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
"paper_url": null,
"data_statistics": null,
"sample_example": null,
"tags": [
"Math",
"Reasoning"
],
"filters": null,
"metric_list": [
{
"acc": {
"numeric": true
}
}
],
"aggregation": "mean",
"shuffle": false,
"shuffle_choices": false,
"force_redownload": false,
"review_timeout": null,
"extra_params": {},
"sandbox_config": {}
}
},
"dataset_dir": "/root/.cache/modelscope/hub/datasets",
"dataset_hub": "modelscope",
"repeats": 1,
"generation_config": {
"batch_size": 1
},
"eval_type": "mock_llm",
"eval_backend": "Native",
"eval_config": null,
"limit": null,
"eval_batch_size": 1,
"use_cache": "/data1/sunzhq/llm-benchmark/tools/evalscope-data",
"rerun_review": true,
"work_dir": "/data1/sunzhq/llm-benchmark/tools/evalscope-data",
"no_timestamp": true,
"enable_progress_tracker": false,
"ignore_errors": false,
"debug": false,
"seed": 42,
"api_url": null,
"timeout": null,
"stream": null,
"judge_strategy": "auto",
"judge_worker_num": 1,
"judge_model_args": {},
"analysis_report": false,
"use_sandbox": false,
"sandbox_type": "docker",
"sandbox_manager_config": {},
"evalscope_version": "1.5.2.post1"
}
2026-04-14 09:31:50 - evalscope - INFO: Start loading benchmark dataset: math_500
2026-04-14 09:31:50 - evalscope - INFO: Start evaluating 5 subsets of math_500: ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 1.jsonl, got 43 predictions, remaining 43 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 43 samples in subset 'Level 1' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 2.jsonl, got 90 predictions, remaining 90 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 90 samples in subset 'Level 2' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 3.jsonl, got 105 predictions, remaining 105 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 105 samples in subset 'Level 3' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 4.jsonl, got 128 predictions, remaining 128 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 128 samples in subset 'Level 4' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 5.jsonl, got 134 predictions, remaining 134 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 134 samples in subset 'Level 5' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Unified pool: 500 items to process, 0 already fully cached (500 total across all subsets).
2026-04-14 09:31:52 - evalscope - INFO: Evaluating[math_500] 100%| 500/500 [Elapsed: 00:02 < Remaining: 00:00, 83.48it/s]
2026-04-14 09:31:52 - evalscope - INFO: Unified pool finished for math_500.
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 1
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 2
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 3
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 4
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 5
2026-04-14 09:31:52 - evalscope - INFO: Generating report...
2026-04-14 09:31:52 - evalscope - INFO:
math_500 report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9535 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9889 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9524 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9531 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.8881 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.942 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 09:31:52 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
2026-04-14 09:31:52 - evalscope - INFO: Dump report to: /data1/sunzhq/llm-benchmark/tools/evalscope-data/reports/qwen3-8B/math_500.json
2026-04-14 09:31:52 - evalscope - INFO: Benchmark math_500 evaluation finished.
2026-04-14 09:31:52 - evalscope - INFO: Running[eval] 100%| 1/1 [Elapsed: 00:02 < Remaining: 00:00, 2.69s/benchmark]
2026-04-14 09:31:52 - evalscope - INFO: Overall report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9535 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9889 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9524 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9531 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.8881 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.942 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 09:31:53 - evalscope - INFO: HTML report generated: /data1/sunzhq/llm-benchmark/tools/evalscope-data/reports/report.html
2026-04-14 09:31:53 - evalscope - INFO: Finished evaluation for qwen3-8B on ['math_500']
2026-04-14 09:31:53 - evalscope - INFO: Output directory: /data1/sunzhq/llm-benchmark/tools/evalscope-data
{
"name": "qwen3-8B@math_500",
"dataset_name": "math_500",
"dataset_pretty_name": "MATH-500",
"dataset_description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
"model_name": "qwen3-8B",
"score": 0.942,
"metrics": [
{
"name": "mean_acc",
"num": 500,
"score": 0.942,
"macro_score": 0.942,
"categories": [
{
"name": [
"default"
],
"num": 500,
"score": 0.942,
"macro_score": 0.9472,
"subsets": [
{
"name": "Level 1",
"score": 0.9535,
"num": 43
},
{
"name": "Level 2",
"score": 0.9889,
"num": 90
},
{
"name": "Level 3",
"score": 0.9524,
"num": 105
},
{
"name": "Level 4",
"score": 0.9531,
"num": 128
},
{
"name": "Level 5",
"score": 0.8881,
"num": 134
}
]
}
]
}
],
"analysis": "N/A"
}