Commit 2a7c435f authored by sunzhq2

init

parent 59a0ec90
# llm-benchmarks

## MATH-500 dataset
- https://www.modelscope.cn/datasets/AI-ModelScope/MATH-500
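- The dataset is not vendored in this repo. One way to fetch it locally (an assumption, not part of the original setup; the target directory `./MATH-500` matches the paths used by the scripts below, and git-lfs is only needed if the repo contains large files):
```
git lfs install
git clone https://www.modelscope.cn/datasets/AI-ModelScope/MATH-500.git ./MATH-500
```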
## vLLM script modifications
- Replace serve.py:
```
mv /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/serve.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/serve.py.bak
cp ./utils/vllm-benchmarks/serve.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/
```
- Replace datasets.py:
```
mv /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/datasets.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/datasets.py.bak
cp ./utils/vllm-benchmarks/datasets.py /usr/local/lib/python3.10/dist-packages/vllm/benchmarks/
```
## evalscope modifications
- Replace evaluator.py:
```
cp /usr/local/lib/python3.10/dist-packages/evalscope/evaluator/evaluator.py /usr/local/lib/python3.10/dist-packages/evalscope/evaluator/evaluator.py.bak
cp ./utils/evalscope/evaluator.py /usr/local/lib/python3.10/dist-packages/evalscope/evaluator/
```
## Start the vLLM server
- bash vllm_serve.sh (a sketch of the underlying command is shown below)
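- The actual vllm_serve.sh is part of this repo and is not reproduced here; a minimal sketch of the kind of command it likely wraps, assuming the model path, served model name, and port used by run_performance_test.sh:
```
vllm serve /data2/models/qwen3-8B \
    --served-model-name qwen3-8B \
    --host 0.0.0.0 \
    --port 8000
```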
## Save performance and accuracy results
- bash run_benchmarks.sh (a sketch of what it does is shown below)
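- run_benchmarks.sh is included in the repo and not shown here; a hedged sketch of what it likely chains together, based on the two scripts in this commit (the accuracy-script filename is illustrative, not the real one):
```
#!/bin/bash
set -e
bash run_performance_test.sh        # load test; saves detailed generations under ./results/
python eval_math500_accuracy.py     # hypothetical name: offline accuracy check over the saved perf JSON
```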
## Dataset conversion
```
cd tools
bash run_convert.sh
```
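- run_convert.sh wraps the conversion script included in this commit (the one exposing `--perf` / `--test` / `--output`). A direct invocation would look roughly like the following; the script name convert_perf_to_evalscope.py is illustrative, and the output directory mirrors the predictions path that evalscope later reads:
```
python convert_perf_to_evalscope.py \
    --perf ../results/performance_results/qwen3_8b_math500_perf.json \
    --test ../MATH-500/test.jsonl \
    --output ./evalscope-data/predictions/qwen3-8B/
```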
## Run the accuracy evaluation
```
cd tools
bash evalscope_test.sh
```
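- evalscope_test.sh is provided in the repo. Judging from the task config dumped later in this commit (use_cache and work_dir both pointing at tools/evalscope-data, rerun_review enabled), it likely re-scores the cached predictions with something along these lines; the exact flags depend on the installed evalscope version, so treat this as a sketch rather than the real script:
```
evalscope eval \
    --model qwen3-8B \
    --datasets math_500 \
    --use-cache ./evalscope-data \
    --work-dir ./evalscope-data
```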
#!/bin/bash
# run_performance_test.sh - run the performance benchmark and save detailed model outputs
set -e
if [ ! -d "./results" ]; then
    mkdir ./results
fi
rm -rf ./results/*
# ========== Configuration ==========
BASE_URL="http://0.0.0.0:8000"
MODEL_NAME="qwen3-8B"
LOCAL_MODEL_PATH="/data2/models/qwen3-8B"
DATASET_JSONL="./MATH-500/test.jsonl"
PROMPTS_FILE="./results/math500_prompts.jsonl"
RESULT_DIR="./results/performance_results"
RESULT_FILENAME="qwen3_8b_math500_perf.json"
REQUEST_RATE=10
# ===========================
# 1. Check that the vLLM service is reachable
echo ">>> Checking vLLM server status..."
if ! curl -s "${BASE_URL}/health" > /dev/null; then
    echo ">>> Error: cannot connect to ${BASE_URL}"
    exit 1
fi
echo ">>> Service is up."
# 2. Generate the JSONL prompt file (sampling_params can be added per record if needed)
if [ -f "$PROMPTS_FILE" ]; then
    echo ">>> Old prompt file found; deleting and regenerating."
    rm -f "$PROMPTS_FILE"
fi
echo ">>> Generating JSONL prompt file from $DATASET_JSONL ..."
python - "$DATASET_JSONL" "$PROMPTS_FILE" << 'EOF'
import sys, json
with open(sys.argv[1], 'r', encoding='utf-8') as f_in, \
     open(sys.argv[2], 'w', encoding='utf-8') as f_out:
    for line in f_in:
        item = json.loads(line)
        prompt_text = f"{item['problem']}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}."
        record = {
            "prompt": prompt_text,
            "metadata": {"unique_id": item["unique_id"]}
        }
        f_out.write(json.dumps(record, ensure_ascii=False) + '\n')
EOF
# "sampling_params": {"max_tokens": 8192},
echo ">>> JSONL 提示词文件已保存至 $PROMPTS_FILE"
# 3. 执行性能测试(带详细保存)
echo ">>> 开始性能压测并保存详细结果..."
vllm bench serve \
--backend vllm \
--base-url "$BASE_URL" \
--model "$MODEL_NAME" \
--tokenizer "$LOCAL_MODEL_PATH" \
--dataset-name custom \
--dataset-path "$PROMPTS_FILE" \
--num-prompts 500 \
--request-rate "$REQUEST_RATE" \
--save-result \
--save-detailed \
--result-dir "$RESULT_DIR" \
--result-filename "$RESULT_FILENAME" \
--custom-output-len 32768 \
--metadata \
--max-concurrency 256 \
--temperature 0.0
# --top_p 0.6
echo ">>> 性能测试完成!结果目录: $RESULT_DIR"
echo ">>> 汇总性能指标: $RESULT_DIR/$RESULT_FILENAME"
import json
import re
from typing import Dict, Any

WRONG_IDS_FILE = "wrong_ids.txt"  # file that collects the unique_id of every incorrectly answered sample

def clean_think_tags(text: str) -> str:
    """
    Strip reasoning markers the model may emit: ``\\final ... \\final`` spans,
    ``\\think ... \\think`` spans, and trailing ``\\instant`` markers.
    """
    # Remove \final ... \final spans (non-greedy)
    text = re.sub(r'\\final.*?\\final', '', text, flags=re.DOTALL)
    # Remove \think ... \think spans, if present
    text = re.sub(r'\\think.*?\\think', '', text, flags=re.DOTALL)
    # Remove \instant markers up to the end of the line
    text = re.sub(r'\\instant.*?(?=\n|$)', '', text, flags=re.DOTALL)
    return text.strip()

def extract_boxed_answer(text: str) -> str:
    """
    Extract the content of the last \\boxed{...} in the text, with robust
    handling of nested braces.
    """
    # Find every occurrence of \boxed{
    pattern = r'\\boxed\{'
    matches = list(re.finditer(pattern, text))
    if not matches:
        # No \boxed{} found: fall back to the last non-empty line
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        return lines[-1] if lines else text
    # Start from the last \boxed{
    last_match = matches[-1]
    start = last_match.end() - 1  # index of the opening '{'
    # Match braces with a stack
    stack = []
    i = start
    while i < len(text):
        if text[i] == '{':
            stack.append('{')
        elif text[i] == '}':
            stack.pop()
            if not stack:
                # Found the matching closing brace
                content = text[start+1:i]
                return content.strip()
        i += 1
    # Braces are unbalanced: fall back to a simple regex
    simple_match = re.search(r'\\boxed\{([^}]*(?:\{[^}]*\}[^}]*)*)\}', text)
    if simple_match:
        return simple_match.group(1).strip()
    return text.strip()

def normalize_math_answer(answer: str) -> str:
    """
    Normalize a math answer by removing LaTeX, whitespace, and bracket
    formatting differences.
    Note: this works well for fractions and simple expressions, but may not
    cover every symbolic answer.
    """
    # Remove whitespace
    normalized = re.sub(r'\s+', '', answer)
    # Drop \left and \right
    normalized = re.sub(r'\\left|\\right', '', normalized)
    # Drop LaTeX grouping braces. This changes the structure, but for
    # comparison purposes \frac{13}{15} and \frac1315 become equivalent.
    normalized = normalized.replace('{', '').replace('}', '')
    # Unify bracket styles
    normalized = normalized.replace('[', '(').replace(']', ')')
    # Drop \displaystyle
    normalized = re.sub(r'\\displaystyle', '', normalized)
    # Strip a trailing period
    normalized = normalized.rstrip('.')
    return normalized.strip()

def compare_answers(predicted: str, ground_truth: str) -> bool:
    pred_norm = normalize_math_answer(predicted)
    truth_norm = normalize_math_answer(ground_truth)
    # Exact match after normalization
    if pred_norm == truth_norm:
        return True
    # Fall back to numeric comparison when both sides are numbers or simple expressions
    try:
        # Restricted eval for basic arithmetic only
        pred_val = eval(pred_norm.replace('^', '**'), {"__builtins__": None}, {})
        truth_val = eval(truth_norm.replace('^', '**'), {"__builtins__": None}, {})
        return abs(pred_val - truth_val) < 1e-9
    except Exception:
        pass
    return False

def load_ground_truth_jsonl(filepath: str) -> Dict[str, Dict[str, str]]:
    data_dict = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                item = json.loads(line)
                unique_id = item.get('unique_id')
                if unique_id:
                    data_dict[unique_id] = item
                else:
                    print(f"Warning: sample in dataset is missing unique_id: {item.get('problem', '')[:50]}...")
    return data_dict

def main():
    DATASET_JSONL = "/data1/sunzhq/llm-benchmark/MATH-500/test.jsonl"
    VLLM_RESULT_JSON = "/data1/sunzhq/llm-benchmark/results-1/performance_results/qwen3_8b_math500_perf.json"
    OUTPUT_EVAL_FILE = "evaluation_results.json"
    print(f"Loading dataset: {DATASET_JSONL}")
    ground_truth_dict = load_ground_truth_jsonl(DATASET_JSONL)
    print(f"Dataset index built, {len(ground_truth_dict)} unique samples in total.")
    print(f"Loading vLLM results: {VLLM_RESULT_JSON}")
    with open(VLLM_RESULT_JSON, 'r', encoding='utf-8') as f:
        vllm_data = json.load(f)
    generated_texts = vllm_data.get('generated_texts', [])
    dataset_metadata_list = vllm_data.get('dataset_metadata', [])
    if len(generated_texts) != len(dataset_metadata_list):
        print(f"Error: number of generated texts ({len(generated_texts)}) does not match number of metadata entries ({len(dataset_metadata_list)})!")
        return
    correct_count = 0
    eval_results = []
    wrong_ids = []
    for idx, (raw_output, meta) in enumerate(zip(generated_texts, dataset_metadata_list)):
        unique_id = meta.get('unique_id')
        if not unique_id:
            print(f"Warning: sample {idx} has no unique_id in its metadata. Skipping.")
            continue
        ground_truth_item = ground_truth_dict.get(unique_id)
        if not ground_truth_item:
            print(f"Warning: no sample with id {unique_id} found in the original dataset. Skipping.")
            continue
        ground_truth = ground_truth_item.get('answer', '')
        cleaned = clean_think_tags(raw_output)
        predicted = extract_boxed_answer(cleaned)
        is_correct = compare_answers(predicted, ground_truth)
        if is_correct:
            correct_count += 1
        else:
            wrong_ids.append(unique_id)
        eval_results.append({
            'index': idx,
            'unique_id': unique_id,
            'problem': ground_truth_item.get('problem', '')[:100] + '...',
            'predicted': predicted,
            'ground_truth': ground_truth,
            'correct': is_correct
        })
        if (idx + 1) % 10 == 0:
            print(f"Processed {idx+1} samples...")
    total_valid_samples = len(eval_results)
    accuracy = correct_count / total_valid_samples if total_valid_samples > 0 else 0.0
    print("\n" + "=" * 60)
    print("Evaluation finished!")
    print(f"Valid samples evaluated: {total_valid_samples}")
    print(f"Correct: {correct_count}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    # Save detailed results
    output_data = {
        'summary': {
            'total_processed': total_valid_samples,
            'correct': correct_count,
            'accuracy': accuracy
        },
        'details': eval_results
    }
    with open(OUTPUT_EVAL_FILE, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)
    print(f"Detailed evaluation results saved to: {OUTPUT_EVAL_FILE}")
    # Save the ids of the wrong samples
    if wrong_ids:
        with open(WRONG_IDS_FILE, 'w', encoding='utf-8') as f:
            for uid in wrong_ids:
                f.write(uid + '\n')
        print(f"List of wrong unique_ids saved to: {WRONG_IDS_FILE} ({len(wrong_ids)} in total)")
    else:
        print("No wrong samples.")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import sys
import argparse
import os
from collections import defaultdict

def extract_reasoning_and_text(generated_text: str):
    """Split generated_text into the <think> reasoning part and the answer text."""
    pattern = re.compile(r'<think>(.*?)</think>', re.DOTALL)
    match = pattern.search(generated_text)
    if match:
        reasoning = match.group(1).strip()
        text_part = pattern.sub('', generated_text).strip()
    else:
        reasoning = ""
        text_part = generated_text.strip()
    return reasoning, text_part

def build_output_item(seq_idx: int, unique_id: str, solution: str, generated_text: str):
    """Build the output JSON object for one item; seq_idx is the item's sequence number within its Level."""
    reasoning, answer_text = extract_reasoning_and_text(generated_text)
    content = [
        {
            "internal": None,
            "type": "reasoning",
            "reasoning": reasoning,
            "signature": None,
            "redacted": False
        },
        {
            "internal": None,
            "type": "text",
            "text": answer_text,
            "refusal": None
        }
    ]
    model_output = {
        "model": "qwen3-8B",
        "choices": [
            {
                "message": {
                    "content": content,
                    "source": "generate",
                    "metadata": None,
                    "internal": None,
                    "role": "assistant",
                    "tool_calls": None,
                    "model": "qwen3-8B"
                },
                "stop_reason": "stop",
                "logprobs": None
            }
        ],
        "usage": {  # note: these fixed counts are written for every item
            "input_tokens": 109,
            "output_tokens": 6708,
            "total_tokens": 6817,
            "input_tokens_cache_write": None,
            "input_tokens_cache_read": None,
            "reasoning_tokens": None
        },
        "time": None,
        "metadata": None,
        "error": None
    }
    messages = [
        {
            "content": content,
            "source": "generate",
            "metadata": None,
            "internal": None,
            "role": "assistant",
            "tool_calls": None,
            "model": "qwen3-8B"
        }
    ]
    output = {
        "index": seq_idx,  # sequence number within this Level
        "model": "qwen3-8B",
        "model_output": model_output,
        "messages": messages,
        "metadata": {
            "question_id": unique_id,
            "solution": solution
        }
    }
    return output

def main():
    parser = argparse.ArgumentParser(
        description="Convert perf.json plus test.jsonl into per-level jsonl files"
    )
    parser.add_argument(
        "--perf",
        required=True,
        help="Path to the input perf.json file (the generated performance result file)"
    )
    parser.add_argument(
        "--test",
        required=True,
        help="Path to the input test.jsonl file (the dataset)"
    )
    parser.add_argument(
        "--output",
        default="math_500_Level_{level}.jsonl",
        help="Output path template (must contain {level}) or an output directory. "
             "If a directory is given, 'math_500_Level {level}.jsonl' files are created inside it. "
             "Examples: 'output/level_{level}.jsonl' or 'output_dir/'. "
             "Default: 'math_500_Level_{level}.jsonl'"
    )
    args = parser.parse_args()
    perf_file = args.perf
    test_file = args.test
    output_pattern = args.output
    # 1. Read test.jsonl and assign a sequence number to each item within its Level.
    #    Mapping: unique_id -> (level, seq_in_level, solution)
    id_map = {}
    level_counter = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}  # current sequence number per Level (0-based)
    with open(test_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            unique_id = obj.get('unique_id')
            solution = obj.get('solution', '')
            level = obj.get('level')
            if unique_id is None or level not in (1, 2, 3, 4, 5):
                continue
            seq = level_counter[level]
            id_map[unique_id] = (level, seq, solution)
            level_counter[level] += 1
    # 2. Read perf.json
    with open(perf_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    generated_texts = data.get('generated_texts', [])
    dataset_metadata = data.get('dataset_metadata', [])
    if len(generated_texts) != len(dataset_metadata):
        print("Warning: generated_texts and dataset_metadata lengths differ", file=sys.stderr)
    # 3. Group the output objects by level, using the within-level sequence number as the index
    level_outputs = defaultdict(list)
    for text, meta in zip(generated_texts, dataset_metadata):
        unique_id = meta.get('unique_id', '')
        if unique_id not in id_map:
            print(f"Warning: unique_id {unique_id} not found in test.jsonl, skipping", file=sys.stderr)
            continue
        level, seq, solution = id_map[unique_id]
        output_item = build_output_item(seq, unique_id, solution, text)
        level_outputs[level].append(output_item)
    # 4. Resolve the output path: if it does not contain {level}, treat it as a directory
    #    (created if missing) and use the default file name inside it.
    if '{level}' not in output_pattern:
        out_dir = output_pattern
        os.makedirs(out_dir, exist_ok=True)
        output_pattern = os.path.join(out_dir, "math_500_Level {level}.jsonl")
    # If it does contain {level}, it is used as-is; output directories are still created below.
    # 5. Write one jsonl file per level
    for level in range(1, 6):
        items = level_outputs.get(level, [])
        # Make sure items are sorted by their within-level sequence number
        items.sort(key=lambda x: x['index'])
        out_file = output_pattern.format(level=level)
        # Make sure the output directory exists
        out_dir = os.path.dirname(out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(out_file, 'w', encoding='utf-8') as f:
            for obj in items:
                f.write(json.dumps(obj, ensure_ascii=False) + '\n')
        print(f"Written {len(items)} items to {out_file}")


if __name__ == "__main__":
    main()
analysis_report: false
api_url: null
chat_template: null
dataset_args:
  math_500:
    aggregation: mean
    data_statistics: null
    dataset_id: AI-ModelScope/MATH-500
    default_subset: default
    description: '
      ## Overview
      MATH-500 is a curated subset of 500 problems from the MATH benchmark, designed
      to evaluate the mathematical reasoning capabilities of language models. It covers
      five difficulty levels across various mathematical topics including algebra,
      geometry, number theory, and calculus.
      ## Task Description
      - **Task Type**: Mathematical Problem Solving
      - **Input**: Mathematical problem statement
      - **Output**: Step-by-step solution with final numerical answer
      - **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)
      ## Key Features
      - 500 carefully selected problems from the full MATH dataset
      - Five difficulty levels for fine-grained evaluation
      - Problems cover algebra, geometry, number theory, probability, and more
      - Each problem includes a reference solution
      - Designed for efficient yet comprehensive math evaluation
      ## Evaluation Notes
      - Default configuration uses **0-shot** evaluation
      - Answers should be formatted within `\boxed{}` for proper extraction
      - Numeric equivalence checking for answer comparison
      - Results can be broken down by difficulty level
      - Commonly used for math reasoning benchmarking due to manageable size
      '
    eval_split: test
    extra_params: {}
    few_shot_num: 0
    few_shot_prompt_template: null
    few_shot_random: false
    filters: null
    force_redownload: false
    metric_list:
    - acc:
        numeric: true
    name: math_500
    output_types:
    - generation
    paper_url: null
    pretty_name: MATH-500
    prompt_template: '{question}
      Please reason step by step, and put your final answer within \boxed{{}}.'
    query_template: null
    review_timeout: null
    sample_example: null
    sandbox_config: {}
    shuffle: false
    shuffle_choices: false
    subset_list:
    - Level 1
    - Level 2
    - Level 3
    - Level 4
    - Level 5
    system_prompt: null
    tags:
    - Math
    - Reasoning
    train_split: null
dataset_dir: /root/.cache/modelscope/hub/datasets
dataset_hub: modelscope
datasets:
- math_500
debug: false
enable_progress_tracker: false
eval_backend: Native
eval_batch_size: 1
eval_config: null
eval_type: mock_llm
evalscope_version: 1.5.2.post1
generation_config:
  batch_size: 1
ignore_errors: false
judge_model_args: {}
judge_strategy: auto
judge_worker_num: 1
limit: null
model: text_generation
model_args: {}
model_id: qwen3-8B
model_task: text_generation
no_timestamp: true
repeats: 1
rerun_review: true
sandbox_manager_config: {}
sandbox_type: docker
seed: 42
stream: null
timeout: null
use_cache: /data1/sunzhq/llm-benchmark/tools/evalscope-data
use_sandbox: false
work_dir: /data1/sunzhq/llm-benchmark/tools/evalscope-data
2026-04-14 09:31:50 - evalscope - INFO: Running with native backend
2026-04-14 09:31:50 - evalscope - INFO: Dump task config to /data1/sunzhq/llm-benchmark/tools/evalscope-data/configs/task_config.yaml
2026-04-14 09:31:50 - evalscope - INFO: {
"model": "text_generation",
"model_id": "qwen3-8B",
"model_args": {},
"model_task": "text_generation",
"chat_template": null,
"datasets": [
"math_500"
],
"dataset_args": {
"math_500": {
"name": "math_500",
"dataset_id": "AI-ModelScope/MATH-500",
"output_types": [
"generation"
],
"subset_list": [
"Level 1",
"Level 2",
"Level 3",
"Level 4",
"Level 5"
],
"default_subset": "default",
"few_shot_num": 0,
"few_shot_random": false,
"train_split": null,
"eval_split": "test",
"prompt_template": "{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
"few_shot_prompt_template": null,
"system_prompt": null,
"query_template": null,
"pretty_name": "MATH-500",
"description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
"paper_url": null,
"data_statistics": null,
"sample_example": null,
"tags": [
"Math",
"Reasoning"
],
"filters": null,
"metric_list": [
{
"acc": {
"numeric": true
}
}
],
"aggregation": "mean",
"shuffle": false,
"shuffle_choices": false,
"force_redownload": false,
"review_timeout": null,
"extra_params": {},
"sandbox_config": {}
}
},
"dataset_dir": "/root/.cache/modelscope/hub/datasets",
"dataset_hub": "modelscope",
"repeats": 1,
"generation_config": {
"batch_size": 1
},
"eval_type": "mock_llm",
"eval_backend": "Native",
"eval_config": null,
"limit": null,
"eval_batch_size": 1,
"use_cache": "/data1/sunzhq/llm-benchmark/tools/evalscope-data",
"rerun_review": true,
"work_dir": "/data1/sunzhq/llm-benchmark/tools/evalscope-data",
"no_timestamp": true,
"enable_progress_tracker": false,
"ignore_errors": false,
"debug": false,
"seed": 42,
"api_url": null,
"timeout": null,
"stream": null,
"judge_strategy": "auto",
"judge_worker_num": 1,
"judge_model_args": {},
"analysis_report": false,
"use_sandbox": false,
"sandbox_type": "docker",
"sandbox_manager_config": {},
"evalscope_version": "1.5.2.post1"
}
2026-04-14 09:31:50 - evalscope - INFO: Start loading benchmark dataset: math_500
2026-04-14 09:31:50 - evalscope - INFO: Start evaluating 5 subsets of math_500: ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 1.jsonl, got 43 predictions, remaining 43 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 43 samples in subset 'Level 1' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 2.jsonl, got 90 predictions, remaining 90 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 90 samples in subset 'Level 2' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 3.jsonl, got 105 predictions, remaining 105 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 105 samples in subset 'Level 3' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 4.jsonl, got 128 predictions, remaining 128 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 128 samples in subset 'Level 4' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Reusing predictions from /data1/sunzhq/llm-benchmark/tools/evalscope-data/predictions/qwen3-8B/math_500_Level 5.jsonl, got 134 predictions, remaining 134 samples
2026-04-14 09:31:50 - evalscope - WARNING: [Rerun review mode] Skipping 134 samples in subset 'Level 5' due to missing cached predictions. They will NOT be inferred.
2026-04-14 09:31:50 - evalscope - INFO: Unified pool: 500 items to process, 0 already fully cached (500 total across all subsets).
2026-04-14 09:31:52 - evalscope - INFO: Evaluating[math_500] 100%| 500/500 [Elapsed: 00:02 < Remaining: 00:00, 83.48it/s]
2026-04-14 09:31:52 - evalscope - INFO: Unified pool finished for math_500.
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 1
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 2
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 3
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 4
2026-04-14 09:31:52 - evalscope - INFO: Aggregating scores for subset: Level 5
2026-04-14 09:31:52 - evalscope - INFO: Generating report...
2026-04-14 09:31:52 - evalscope - INFO:
math_500 report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9535 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9889 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9524 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9531 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.8881 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.942 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 09:31:52 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
2026-04-14 09:31:52 - evalscope - INFO: Dump report to: /data1/sunzhq/llm-benchmark/tools/evalscope-data/reports/qwen3-8B/math_500.json
2026-04-14 09:31:52 - evalscope - INFO: Benchmark math_500 evaluation finished.
2026-04-14 09:31:52 - evalscope - INFO: Running[eval] 100%| 1/1 [Elapsed: 00:02 < Remaining: 00:00, 2.69s/benchmark]
2026-04-14 09:31:52 - evalscope - INFO: Overall report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9535 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9889 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9524 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9531 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.8881 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.942 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 09:31:53 - evalscope - INFO: HTML report generated: /data1/sunzhq/llm-benchmark/tools/evalscope-data/reports/report.html
2026-04-14 09:31:53 - evalscope - INFO: Finished evaluation for qwen3-8B on ['math_500']
2026-04-14 09:31:53 - evalscope - INFO: Output directory: /data1/sunzhq/llm-benchmark/tools/evalscope-data
{
"name": "qwen3-8B@math_500",
"dataset_name": "math_500",
"dataset_pretty_name": "MATH-500",
"dataset_description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
"model_name": "qwen3-8B",
"score": 0.942,
"metrics": [
{
"name": "mean_acc",
"num": 500,
"score": 0.942,
"macro_score": 0.942,
"categories": [
{
"name": [
"default"
],
"num": 500,
"score": 0.942,
"macro_score": 0.9472,
"subsets": [
{
"name": "Level 1",
"score": 0.9535,
"num": 43
},
{
"name": "Level 2",
"score": 0.9889,
"num": 90
},
{
"name": "Level 3",
"score": 0.9524,
"num": 105
},
{
"name": "Level 4",
"score": 0.9531,
"num": 128
},
{
"name": "Level 5",
"score": 0.8881,
"num": 134
}
]
}
]
}
],
"analysis": "N/A"
}