Commit 8b2e8ec0 authored by sunzhq2

init evalscope

parent 2a7c435f
@@ -4,21 +4,21 @@
   "dataset_pretty_name": "MATH-500",
   "dataset_description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
   "model_name": "qwen3-8B",
-  "score": 0.942,
+  "score": 0.938,
   "metrics": [
     {
       "name": "mean_acc",
       "num": 500,
-      "score": 0.942,
-      "macro_score": 0.942,
+      "score": 0.938,
+      "macro_score": 0.938,
       "categories": [
         {
           "name": [
             "default"
           ],
           "num": 500,
-          "score": 0.942,
-          "macro_score": 0.9472,
+          "score": 0.938,
+          "macro_score": 0.9434,
           "subsets": [
             {
               "name": "Level 1",
@@ -27,7 +27,7 @@
             },
             {
               "name": "Level 2",
-              "score": 0.9889,
+              "score": 0.9778,
               "num": 90
             },
             {
@@ -37,7 +37,7 @@
             },
             {
               "name": "Level 4",
-              "score": 0.9531,
+              "score": 0.9453,
               "num": 128
             },
             {
...
@@ -6,18 +6,18 @@
 # --api-url 'http://0.0.0.0:8000/v1/chat/completions' \
 # --api-key 'EMPTY' \
 # --datasets 'math_500' \
-# --dataset-args '{"math_500": {"local_path": "/data1/sunzhq/llm-benchmark/MATH-500"}}' \
+# --dataset-args '{"math_500": {"local_path": "../MATH-500", "subset_list": ["Level 1"]}}' \
 # --eval-batch-size 32 \
 # --generation-config '{"batch_size": 32, "temperature": 0.0}' \
-# --timeout 1800 \
+# --timeout 1800
 evalscope eval \
-    --use-cache /data1/sunzhq/llm-benchmark/tools/evalscope-data \
+    --use-cache ./evalscope-data-001 \
     --datasets math_500 \
     --model-id qwen3-8B \
     --no-timestamp \
     --rerun-review
-# --dataset-args '{"math_500": {"local_path": "/data1/sunzhq/llm-benchmark/MATH-500", "subset_list": ["Level 2"]}}' \
+# --dataset-args '{"math_500": {"local_path": "../MATH-500", "subset_list": ["Level 2"]}}' \
\ No newline at end of file
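The same run can also be driven from Python instead of the CLI. The sketch below is illustrative only and is not part of this commit: it assumes evalscope's `TaskConfig`/`run_task` entry points and reuses field names that appear in the dumped task_config.yaml below (datasets, dataset_args, model_id, use_cache, rerun_review, no_timestamp); paths mirror the commented CLI example.

# Illustrative sketch, not part of this commit: the CLI call above expressed
# through evalscope's Python entry points (TaskConfig / run_task), using the
# same field names that appear in the dumped task_config.yaml.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model_id='qwen3-8B',
    datasets=['math_500'],
    dataset_args={
        'math_500': {
            'local_path': '../MATH-500',            # local dataset copy, as in the commented CLI line
            'subset_list': ['Level 1', 'Level 2'],  # optionally restrict to specific levels
        }
    },
    use_cache='./evalscope-data-001',  # reuse cached predictions
    rerun_review=True,                 # re-run only the review/scoring step
    no_timestamp=True,
)

run_task(task_cfg=task_cfg)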
analysis_report: false
api_url: null
chat_template: null
dataset_args:
  math_500:
    aggregation: mean
    data_statistics: null
    dataset_id: AI-ModelScope/MATH-500
    default_subset: default
    description: '
      ## Overview
      MATH-500 is a curated subset of 500 problems from the MATH benchmark, designed
      to evaluate the mathematical reasoning capabilities of language models. It covers
      five difficulty levels across various mathematical topics including algebra,
      geometry, number theory, and calculus.
      ## Task Description
      - **Task Type**: Mathematical Problem Solving
      - **Input**: Mathematical problem statement
      - **Output**: Step-by-step solution with final numerical answer
      - **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)
      ## Key Features
      - 500 carefully selected problems from the full MATH dataset
      - Five difficulty levels for fine-grained evaluation
      - Problems cover algebra, geometry, number theory, probability, and more
      - Each problem includes a reference solution
      - Designed for efficient yet comprehensive math evaluation
      ## Evaluation Notes
      - Default configuration uses **0-shot** evaluation
      - Answers should be formatted within `\boxed{}` for proper extraction
      - Numeric equivalence checking for answer comparison
      - Results can be broken down by difficulty level
      - Commonly used for math reasoning benchmarking due to manageable size
      '
    eval_split: test
    extra_params: {}
    few_shot_num: 0
    few_shot_prompt_template: null
    few_shot_random: false
    filters: null
    force_redownload: false
    metric_list:
    - acc:
        numeric: true
    name: math_500
    output_types:
    - generation
    paper_url: null
    pretty_name: MATH-500
    prompt_template: '{question}
      Please reason step by step, and put your final answer within \boxed{{}}.'
    query_template: null
    review_timeout: null
    sample_example: null
    sandbox_config: {}
    shuffle: false
    shuffle_choices: false
    subset_list:
    - Level 1
    - Level 2
    - Level 3
    - Level 4
    - Level 5
    system_prompt: null
    tags:
    - Math
    - Reasoning
    train_split: null
dataset_dir: /root/.cache/modelscope/hub/datasets
dataset_hub: modelscope
datasets:
- math_500
debug: false
enable_progress_tracker: false
eval_backend: Native
eval_batch_size: 1
eval_config: null
eval_type: mock_llm
evalscope_version: 1.5.2.post1
generation_config:
  batch_size: 1
ignore_errors: false
judge_model_args: {}
judge_strategy: auto
judge_worker_num: 1
limit: null
model: text_generation
model_args: {}
model_id: qwen3-8B
model_task: text_generation
no_timestamp: true
repeats: 1
rerun_review: true
sandbox_manager_config: {}
sandbox_type: docker
seed: 42
stream: null
timeout: null
use_cache: ./outputs/20260409_180006
use_sandbox: false
work_dir: ./outputs/20260409_180006
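In the config above, the prompt_template asks the model to put its final answer in \boxed{}, and the only metric is acc with numeric: true, i.e. the extracted answer is compared to the reference by numeric equivalence rather than exact string match. The following is only an illustrative sketch of that idea; it is not evalscope's implementation, and both helper names are made up here.

# Illustrative sketch only -- NOT evalscope's implementation. It shows the idea
# behind `acc` with `numeric: true`: pull the last \boxed{...} span out of the
# model's generation and compare it to the reference answer as a number.
def extract_boxed(text: str) -> str | None:
    """Return the content of the last \\boxed{...} in `text`, handling nested braces."""
    start = text.rfind(r'\boxed{')
    if start == -1:
        return None
    i, depth = start + len(r'\boxed{'), 1
    out = []
    while i < len(text) and depth:
        ch = text[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                break
        out.append(ch)
        i += 1
    return ''.join(out)

def numeric_equal(pred: str | None, ref: str, tol: float = 1e-6) -> bool:
    """Compare answers numerically when both parse as floats, else fall back to string match."""
    try:
        return abs(float(pred) - float(ref)) <= tol
    except (TypeError, ValueError):
        return pred is not None and pred.strip() == ref.strip()

completion = r"... so the total is \boxed{42}."
print(numeric_equal(extract_boxed(completion), "42.0"))  # True under numeric comparison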
2026-04-14 14:01:14 - evalscope - INFO: Running with native backend
2026-04-14 14:01:14 - evalscope - INFO: Dump task config to ./outputs/20260409_180006/configs/task_config.yaml
2026-04-14 14:01:14 - evalscope - INFO: {
"model": "text_generation",
"model_id": "qwen3-8B",
"model_args": {},
"model_task": "text_generation",
"chat_template": null,
"datasets": [
"math_500"
],
"dataset_args": {
"math_500": {
"name": "math_500",
"dataset_id": "AI-ModelScope/MATH-500",
"output_types": [
"generation"
],
"subset_list": [
"Level 1",
"Level 2",
"Level 3",
"Level 4",
"Level 5"
],
"default_subset": "default",
"few_shot_num": 0,
"few_shot_random": false,
"train_split": null,
"eval_split": "test",
"prompt_template": "{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
"few_shot_prompt_template": null,
"system_prompt": null,
"query_template": null,
"pretty_name": "MATH-500",
"description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
"paper_url": null,
"data_statistics": null,
"sample_example": null,
"tags": [
"Math",
"Reasoning"
],
"filters": null,
"metric_list": [
{
"acc": {
"numeric": true
}
}
],
"aggregation": "mean",
"shuffle": false,
"shuffle_choices": false,
"force_redownload": false,
"review_timeout": null,
"extra_params": {},
"sandbox_config": {}
}
},
"dataset_dir": "/root/.cache/modelscope/hub/datasets",
"dataset_hub": "modelscope",
"repeats": 1,
"generation_config": {
"batch_size": 1
},
"eval_type": "mock_llm",
"eval_backend": "Native",
"eval_config": null,
"limit": null,
"eval_batch_size": 1,
"use_cache": "./outputs/20260409_180006",
"rerun_review": true,
"work_dir": "./outputs/20260409_180006",
"no_timestamp": true,
"enable_progress_tracker": false,
"ignore_errors": false,
"debug": false,
"seed": 42,
"api_url": null,
"timeout": null,
"stream": null,
"judge_strategy": "auto",
"judge_worker_num": 1,
"judge_model_args": {},
"analysis_report": false,
"use_sandbox": false,
"sandbox_type": "docker",
"sandbox_manager_config": {},
"evalscope_version": "1.5.2.post1"
}
2026-04-14 14:01:14 - evalscope - INFO: Start loading benchmark dataset: math_500
2026-04-14 14:01:14 - evalscope - INFO: Start evaluating 5 subsets of math_500: ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 1.jsonl, got 43 predictions, remaining 43 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 43 samples in subset 'Level 1' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 2.jsonl, got 90 predictions, remaining 90 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 90 samples in subset 'Level 2' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 3.jsonl, got 105 predictions, remaining 105 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 105 samples in subset 'Level 3' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 4.jsonl, got 128 predictions, remaining 128 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 128 samples in subset 'Level 4' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 5.jsonl, got 134 predictions, remaining 134 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 134 samples in subset 'Level 5' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Unified pool: 500 items to process, 0 already fully cached (500 total across all subsets).
2026-04-14 14:01:16 - evalscope - INFO: Evaluating[math_500] 100%| 500/500 [Elapsed: 00:02 < Remaining: 00:00, 70.93it/s]
2026-04-14 14:01:16 - evalscope - INFO: Unified pool finished for math_500.
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 1
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 2
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 3
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 4
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 5
2026-04-14 14:01:16 - evalscope - INFO: Generating report...
2026-04-14 14:01:16 - evalscope - INFO:
math_500 report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9767 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9778 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9714 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9297 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.9478 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.956 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 14:01:16 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
2026-04-14 14:01:16 - evalscope - INFO: Dump report to: ./outputs/20260409_180006/reports/qwen3-8B/math_500.json
2026-04-14 14:01:16 - evalscope - INFO: Benchmark math_500 evaluation finished.
2026-04-14 14:01:16 - evalscope - INFO: Running[eval] 100%| 1/1 [Elapsed: 00:02 < Remaining: 00:00, 2.67s/benchmark]
2026-04-14 14:01:16 - evalscope - INFO: Overall report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9767 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9778 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9714 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9297 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.9478 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.956 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 14:01:17 - evalscope - INFO: HTML report generated: /data1/sunzhq/tmp/llm-benchmarks/tools/outputs/20260409_180006/reports/report.html
2026-04-14 14:01:17 - evalscope - INFO: Finished evaluation for qwen3-8B on ['math_500']
2026-04-14 14:01:17 - evalscope - INFO: Output directory: ./outputs/20260409_180006
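As a quick sanity check on the tables above (not part of the log): the OVERALL row is the sample-weighted (micro) average of the per-level accuracies, whereas the macro_score field in the report JSON appears to be the unweighted mean across the five levels.

# Quick check (not part of the log): recompute OVERALL from the per-level rows.
subsets = {  # (num_samples, mean_acc) taken from the report table above
    "Level 1": (43, 0.9767),
    "Level 2": (90, 0.9778),
    "Level 3": (105, 0.9714),
    "Level 4": (128, 0.9297),
    "Level 5": (134, 0.9478),
}

total = sum(n for n, _ in subsets.values())
micro = sum(n * acc for n, acc in subsets.values()) / total
macro = sum(acc for _, acc in subsets.values()) / len(subsets)

print(f"micro (weighted) = {micro:.4f}")    # ~0.9560, matching the OVERALL row
print(f"macro (unweighted) = {macro:.4f}")  # ~0.9607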