Commit 8b2e8ec0 authored by sunzhq2

init evalscope

parent 2a7c435f
@@ -4,21 +4,21 @@
   "dataset_pretty_name": "MATH-500",
   "dataset_description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
   "model_name": "qwen3-8B",
-  "score": 0.942,
+  "score": 0.938,
   "metrics": [
     {
       "name": "mean_acc",
       "num": 500,
-      "score": 0.942,
-      "macro_score": 0.942,
+      "score": 0.938,
+      "macro_score": 0.938,
       "categories": [
         {
           "name": [
             "default"
           ],
           "num": 500,
-          "score": 0.942,
-          "macro_score": 0.9472,
+          "score": 0.938,
+          "macro_score": 0.9434,
           "subsets": [
             {
               "name": "Level 1",
@@ -27,7 +27,7 @@
             },
             {
               "name": "Level 2",
-              "score": 0.9889,
+              "score": 0.9778,
               "num": 90
             },
             {
@@ -37,7 +37,7 @@
             },
             {
               "name": "Level 4",
-              "score": 0.9531,
+              "score": 0.9453,
               "num": 128
             },
             {
...
@@ -6,18 +6,18 @@
 # --api-url 'http://0.0.0.0:8000/v1/chat/completions' \
 # --api-key 'EMPTY' \
 # --datasets 'math_500' \
-# --dataset-args '{"math_500": {"local_path": "/data1/sunzhq/llm-benchmark/MATH-500"}}' \
+# --dataset-args '{"math_500": {"local_path": "../MATH-500", "subset_list": ["Level 1"]}}' \
 # --eval-batch-size 32 \
 # --generation-config '{"batch_size": 32, "temperature": 0.0}' \
-# --timeout 1800 \
+# --timeout 1800
 evalscope eval \
-    --use-cache /data1/sunzhq/llm-benchmark/tools/evalscope-data \
+    --use-cache ./evalscope-data-001 \
     --datasets math_500 \
     --model-id qwen3-8B \
     --no-timestamp \
     --rerun-review
-# --dataset-args '{"math_500": {"local_path": "/data1/sunzhq/llm-benchmark/MATH-500", "subset_list": ["Level 2"]}}' \
+# --dataset-args '{"math_500": {"local_path": "../MATH-500", "subset_list": ["Level 2"]}}' \
\ No newline at end of file
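The same run can also be driven from Python instead of the CLI. The sketch below is illustrative only and is not part of this commit: it assumes evalscope's `TaskConfig`/`run_task` entry points and reuses field names that appear in the dumped task_config.yaml below (datasets, dataset_args, model_id, use_cache, rerun_review, no_timestamp); paths mirror the commented CLI example.

# Illustrative sketch, not part of this commit: the CLI call above expressed
# through evalscope's Python entry points (TaskConfig / run_task), using the
# same field names that appear in the dumped task_config.yaml.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model_id='qwen3-8B',
    datasets=['math_500'],
    dataset_args={
        'math_500': {
            'local_path': '../MATH-500',            # local dataset copy, as in the commented CLI line
            'subset_list': ['Level 1', 'Level 2'],  # optionally restrict to specific levels
        }
    },
    use_cache='./evalscope-data-001',  # reuse cached predictions
    rerun_review=True,                 # re-run only the review/scoring step
    no_timestamp=True,
)

run_task(task_cfg=task_cfg)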
analysis_report: false
api_url: null
chat_template: null
dataset_args:
  math_500:
    aggregation: mean
    data_statistics: null
    dataset_id: AI-ModelScope/MATH-500
    default_subset: default
    description: '
      ## Overview
      MATH-500 is a curated subset of 500 problems from the MATH benchmark, designed
      to evaluate the mathematical reasoning capabilities of language models. It covers
      five difficulty levels across various mathematical topics including algebra,
      geometry, number theory, and calculus.
      ## Task Description
      - **Task Type**: Mathematical Problem Solving
      - **Input**: Mathematical problem statement
      - **Output**: Step-by-step solution with final numerical answer
      - **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)
      ## Key Features
      - 500 carefully selected problems from the full MATH dataset
      - Five difficulty levels for fine-grained evaluation
      - Problems cover algebra, geometry, number theory, probability, and more
      - Each problem includes a reference solution
      - Designed for efficient yet comprehensive math evaluation
      ## Evaluation Notes
      - Default configuration uses **0-shot** evaluation
      - Answers should be formatted within `\boxed{}` for proper extraction
      - Numeric equivalence checking for answer comparison
      - Results can be broken down by difficulty level
      - Commonly used for math reasoning benchmarking due to manageable size
      '
    eval_split: test
    extra_params: {}
    few_shot_num: 0
    few_shot_prompt_template: null
    few_shot_random: false
    filters: null
    force_redownload: false
    metric_list:
    - acc:
        numeric: true
    name: math_500
    output_types:
    - generation
    paper_url: null
    pretty_name: MATH-500
    prompt_template: '{question}
      Please reason step by step, and put your final answer within \boxed{{}}.'
    query_template: null
    review_timeout: null
    sample_example: null
    sandbox_config: {}
    shuffle: false
    shuffle_choices: false
    subset_list:
    - Level 1
    - Level 2
    - Level 3
    - Level 4
    - Level 5
    system_prompt: null
    tags:
    - Math
    - Reasoning
    train_split: null
dataset_dir: /root/.cache/modelscope/hub/datasets
dataset_hub: modelscope
datasets:
- math_500
debug: false
enable_progress_tracker: false
eval_backend: Native
eval_batch_size: 1
eval_config: null
eval_type: mock_llm
evalscope_version: 1.5.2.post1
generation_config:
  batch_size: 1
ignore_errors: false
judge_model_args: {}
judge_strategy: auto
judge_worker_num: 1
limit: null
model: text_generation
model_args: {}
model_id: qwen3-8B
model_task: text_generation
no_timestamp: true
repeats: 1
rerun_review: true
sandbox_manager_config: {}
sandbox_type: docker
seed: 42
stream: null
timeout: null
use_cache: ./outputs/20260409_180006
use_sandbox: false
work_dir: ./outputs/20260409_180006
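In the config above, the prompt_template asks the model to put its final answer in \boxed{}, and the only metric is acc with numeric: true, i.e. the extracted answer is compared to the reference by numeric equivalence rather than exact string match. The following is only an illustrative sketch of that idea; it is not evalscope's implementation, and both helper names are made up here.

# Illustrative sketch only -- NOT evalscope's implementation. It shows the idea
# behind `acc` with `numeric: true`: pull the last \boxed{...} span out of the
# model's generation and compare it to the reference answer as a number.
def extract_boxed(text: str) -> str | None:
    """Return the content of the last \\boxed{...} in `text`, handling nested braces."""
    start = text.rfind(r'\boxed{')
    if start == -1:
        return None
    i, depth = start + len(r'\boxed{'), 1
    out = []
    while i < len(text) and depth:
        ch = text[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                break
        out.append(ch)
        i += 1
    return ''.join(out)

def numeric_equal(pred: str | None, ref: str, tol: float = 1e-6) -> bool:
    """Compare answers numerically when both parse as floats, else fall back to string match."""
    try:
        return abs(float(pred) - float(ref)) <= tol
    except (TypeError, ValueError):
        return pred is not None and pred.strip() == ref.strip()

completion = r"... so the total is \boxed{42}."
print(numeric_equal(extract_boxed(completion), "42.0"))  # True under numeric comparison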
2026-04-14 14:01:14 - evalscope - INFO: Running with native backend
2026-04-14 14:01:14 - evalscope - INFO: Dump task config to ./outputs/20260409_180006/configs/task_config.yaml
2026-04-14 14:01:14 - evalscope - INFO: {
"model": "text_generation",
"model_id": "qwen3-8B",
"model_args": {},
"model_task": "text_generation",
"chat_template": null,
"datasets": [
"math_500"
],
"dataset_args": {
"math_500": {
"name": "math_500",
"dataset_id": "AI-ModelScope/MATH-500",
"output_types": [
"generation"
],
"subset_list": [
"Level 1",
"Level 2",
"Level 3",
"Level 4",
"Level 5"
],
"default_subset": "default",
"few_shot_num": 0,
"few_shot_random": false,
"train_split": null,
"eval_split": "test",
"prompt_template": "{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
"few_shot_prompt_template": null,
"system_prompt": null,
"query_template": null,
"pretty_name": "MATH-500",
"description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
"paper_url": null,
"data_statistics": null,
"sample_example": null,
"tags": [
"Math",
"Reasoning"
],
"filters": null,
"metric_list": [
{
"acc": {
"numeric": true
}
}
],
"aggregation": "mean",
"shuffle": false,
"shuffle_choices": false,
"force_redownload": false,
"review_timeout": null,
"extra_params": {},
"sandbox_config": {}
}
},
"dataset_dir": "/root/.cache/modelscope/hub/datasets",
"dataset_hub": "modelscope",
"repeats": 1,
"generation_config": {
"batch_size": 1
},
"eval_type": "mock_llm",
"eval_backend": "Native",
"eval_config": null,
"limit": null,
"eval_batch_size": 1,
"use_cache": "./outputs/20260409_180006",
"rerun_review": true,
"work_dir": "./outputs/20260409_180006",
"no_timestamp": true,
"enable_progress_tracker": false,
"ignore_errors": false,
"debug": false,
"seed": 42,
"api_url": null,
"timeout": null,
"stream": null,
"judge_strategy": "auto",
"judge_worker_num": 1,
"judge_model_args": {},
"analysis_report": false,
"use_sandbox": false,
"sandbox_type": "docker",
"sandbox_manager_config": {},
"evalscope_version": "1.5.2.post1"
}
2026-04-14 14:01:14 - evalscope - INFO: Start loading benchmark dataset: math_500
2026-04-14 14:01:14 - evalscope - INFO: Start evaluating 5 subsets of math_500: ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 1.jsonl, got 43 predictions, remaining 43 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 43 samples in subset 'Level 1' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 2.jsonl, got 90 predictions, remaining 90 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 90 samples in subset 'Level 2' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 3.jsonl, got 105 predictions, remaining 105 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 105 samples in subset 'Level 3' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 4.jsonl, got 128 predictions, remaining 128 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 128 samples in subset 'Level 4' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Reusing predictions from ./outputs/20260409_180006/predictions/qwen3-8B/math_500_Level 5.jsonl, got 134 predictions, remaining 134 samples
2026-04-14 14:01:14 - evalscope - WARNING: [Rerun review mode] Skipping 134 samples in subset 'Level 5' due to missing cached predictions. They will NOT be inferred.
2026-04-14 14:01:14 - evalscope - INFO: Unified pool: 500 items to process, 0 already fully cached (500 total across all subsets).
2026-04-14 14:01:16 - evalscope - INFO: Evaluating[math_500] 100%| 500/500 [Elapsed: 00:02 < Remaining: 00:00, 70.93it/s]
2026-04-14 14:01:16 - evalscope - INFO: Unified pool finished for math_500.
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 1
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 2
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 3
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 4
2026-04-14 14:01:16 - evalscope - INFO: Aggregating scores for subset: Level 5
2026-04-14 14:01:16 - evalscope - INFO: Generating report...
2026-04-14 14:01:16 - evalscope - INFO:
math_500 report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9767 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9778 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9714 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9297 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.9478 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.956 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 14:01:16 - evalscope - INFO: Skipping report analysis (`analysis_report=False`).
2026-04-14 14:01:16 - evalscope - INFO: Dump report to: ./outputs/20260409_180006/reports/qwen3-8B/math_500.json
2026-04-14 14:01:16 - evalscope - INFO: Benchmark math_500 evaluation finished.
2026-04-14 14:01:16 - evalscope - INFO: Running[eval] 100%| 1/1 [Elapsed: 00:02 < Remaining: 00:00, 2.67s/benchmark]
2026-04-14 14:01:16 - evalscope - INFO: Overall report table:
+----------+-----------+----------+----------+-------+---------+---------+
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
+==========+===========+==========+==========+=======+=========+=========+
| qwen3-8B | math_500 | mean_acc | Level 1 | 43 | 0.9767 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 2 | 90 | 0.9778 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 3 | 105 | 0.9714 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 4 | 128 | 0.9297 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | Level 5 | 134 | 0.9478 | default |
+----------+-----------+----------+----------+-------+---------+---------+
| qwen3-8B | math_500 | mean_acc | OVERALL | 500 | 0.956 | - |
+----------+-----------+----------+----------+-------+---------+---------+
2026-04-14 14:01:17 - evalscope - INFO: HTML report generated: /data1/sunzhq/tmp/llm-benchmarks/tools/outputs/20260409_180006/reports/report.html
2026-04-14 14:01:17 - evalscope - INFO: Finished evaluation for qwen3-8B on ['math_500']
2026-04-14 14:01:17 - evalscope - INFO: Output directory: ./outputs/20260409_180006
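As a quick sanity check on the tables above (not part of the log): the OVERALL row is the sample-weighted (micro) average of the per-level accuracies, whereas the macro_score field in the report JSON appears to be the unweighted mean across the five levels.

# Quick check (not part of the log): recompute OVERALL from the per-level rows.
subsets = {  # (num_samples, mean_acc) taken from the report table above
    "Level 1": (43, 0.9767),
    "Level 2": (90, 0.9778),
    "Level 3": (105, 0.9714),
    "Level 4": (128, 0.9297),
    "Level 5": (134, 0.9478),
}

total = sum(n for n, _ in subsets.values())
micro = sum(n * acc for n, acc in subsets.values()) / total
macro = sum(acc for _, acc in subsets.values()) / len(subsets)

print(f"micro (weighted) = {micro:.4f}")    # ~0.9560, matching the OVERALL row
print(f"macro (unweighted) = {macro:.4f}")  # ~0.9607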