analysis_report: false
api_url: null
chat_template: null
dataset_args:
  math_500:
    aggregation: mean
    data_statistics: null
    dataset_id: AI-ModelScope/MATH-500
    default_subset: default
    description: |
      ## Overview
      MATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.

      ## Task Description
      - **Task Type**: Mathematical Problem Solving
      - **Input**: Mathematical problem statement
      - **Output**: Step-by-step solution with final numerical answer
      - **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)

      ## Key Features
      - 500 carefully selected problems from the full MATH dataset
      - Five difficulty levels for fine-grained evaluation
      - Problems cover algebra, geometry, number theory, probability, and more
      - Each problem includes a reference solution
      - Designed for efficient yet comprehensive math evaluation

      ## Evaluation Notes
      - Default configuration uses **0-shot** evaluation
      - Answers should be formatted within `\boxed{}` for proper extraction
      - Numeric equivalence checking for answer comparison
      - Results can be broken down by difficulty level
      - Commonly used for math reasoning benchmarking due to manageable size
    eval_split: test
    extra_params: {}
    few_shot_num: 0
    few_shot_prompt_template: null
    few_shot_random: false
    filters: null
    force_redownload: false
    metric_list:
    - acc:
        numeric: true
    name: math_500
    output_types:
    - generation
    paper_url: null
    pretty_name: MATH-500
    prompt_template: '{question} Please reason step by step, and put your final answer within \boxed{{}}.'
    query_template: null
    review_timeout: null
    sample_example: null
    sandbox_config: {}
    shuffle: false
    shuffle_choices: false
    subset_list:
    - Level 1
    - Level 2
    - Level 3
    - Level 4
    - Level 5
    system_prompt: null
    tags:
    - Math
    - Reasoning
    train_split: null
dataset_dir: /root/.cache/modelscope/hub/datasets
dataset_hub: modelscope
datasets:
- math_500
debug: false
enable_progress_tracker: false
eval_backend: Native
eval_batch_size: 1
eval_config: null
eval_type: mock_llm
evalscope_version: 1.5.2.post1
generation_config:
  batch_size: 1
ignore_errors: false
judge_model_args: {}
judge_strategy: auto
judge_worker_num: 1
limit: null
model: text_generation
model_args: {}
model_id: qwen3-8B
model_task: text_generation
no_timestamp: true
repeats: 1
rerun_review: true
sandbox_manager_config: {}
sandbox_type: docker
seed: 42
stream: null
timeout: null
use_cache: /data1/sunzhq/llm-benchmark/tools/evalscope-data
use_sandbox: false
work_dir: /data1/sunzhq/llm-benchmark/tools/evalscope-data
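# Usage sketch (comments only, not part of the dumped configuration). This file
# records the arguments of an EvalScope run; the Python lines below are a minimal
# sketch of how such a task can be reproduced, assuming this EvalScope version
# exposes the documented `TaskConfig`/`run_task` entry points and that `run_task`
# accepts a YAML config path. The file name `math500_task.yaml` is hypothetical.
#
#   from evalscope import TaskConfig, run_task
#
#   # Option 1: re-run directly from the saved YAML dump (hypothetical file name).
#   run_task(task_cfg='math500_task.yaml')
#
#   # Option 2: rebuild the same task programmatically from the key fields above.
#   task = TaskConfig(
#       model='qwen3-8B',        # this dump uses eval_type: mock_llm; point at a real model or API for actual scoring
#       datasets=['math_500'],
#       eval_batch_size=1,
#       seed=42,
#   )
#   run_task(task_cfg=task)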