math_500.json 2.74 KB
Newer Older
sunzhq2's avatar
init  
sunzhq2 committed
1
2
3
4
5
6
{
    "name": "qwen3-8B@math_500",
    "dataset_name": "math_500",
    "dataset_pretty_name": "MATH-500",
    "dataset_description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
    "model_name": "qwen3-8B",
sunzhq2's avatar
sunzhq2 committed
7
    "score": 0.938,
sunzhq2's avatar
init  
sunzhq2 committed
8
9
10
11
    "metrics": [
        {
            "name": "mean_acc",
            "num": 500,
sunzhq2's avatar
sunzhq2 committed
12
13
            "score": 0.938,
            "macro_score": 0.938,
sunzhq2's avatar
init  
sunzhq2 committed
14
15
16
17
18
19
            "categories": [
                {
                    "name": [
                        "default"
                    ],
                    "num": 500,
sunzhq2's avatar
sunzhq2 committed
20
21
                    "score": 0.938,
                    "macro_score": 0.9434,
sunzhq2's avatar
init  
sunzhq2 committed
22
23
24
25
26
27
28
29
                    "subsets": [
                        {
                            "name": "Level 1",
                            "score": 0.9535,
                            "num": 43
                        },
                        {
                            "name": "Level 2",
sunzhq2's avatar
sunzhq2 committed
30
                            "score": 0.9778,
sunzhq2's avatar
init  
sunzhq2 committed
31
32
33
34
35
36
37
38
39
                            "num": 90
                        },
                        {
                            "name": "Level 3",
                            "score": 0.9524,
                            "num": 105
                        },
                        {
                            "name": "Level 4",
sunzhq2's avatar
sunzhq2 committed
40
                            "score": 0.9453,
sunzhq2's avatar
init  
sunzhq2 committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
                            "num": 128
                        },
                        {
                            "name": "Level 5",
                            "score": 0.8881,
                            "num": 134
                        }
                    ]
                }
            ]
        }
    ],
    "analysis": "N/A"
}