"deploy/git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "841ff3c6a6793062dadc8473199a65ee12af2ee3"
task_config.yaml 2.94 KB
Newer Older
sunzhq2's avatar
init  
sunzhq2 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
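# task_config.yaml: serialized EvalScope task settings for a MATH-500
# evaluation run (model_id qwen3-8B, evalscope 1.5.2.post1).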
analysis_report: false
api_url: null
chat_template: null
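# Per-dataset settings. Only math_500 is configured here, and it is the sole
# entry selected under 'datasets' below.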
dataset_args:
  math_500:
    aggregation: mean
    data_statistics: null
    dataset_id: AI-ModelScope/MATH-500
    default_subset: default
    description: '

      ## Overview


      MATH-500 is a curated subset of 500 problems from the MATH benchmark, designed
      to evaluate the mathematical reasoning capabilities of language models. It covers
      five difficulty levels across various mathematical topics including algebra,
      geometry, number theory, and calculus.


      ## Task Description


      - **Task Type**: Mathematical Problem Solving

      - **Input**: Mathematical problem statement

      - **Output**: Step-by-step solution with final numerical answer

      - **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)


      ## Key Features


      - 500 carefully selected problems from the full MATH dataset

      - Five difficulty levels for fine-grained evaluation

      - Problems cover algebra, geometry, number theory, probability, and more

      - Each problem includes a reference solution

      - Designed for efficient yet comprehensive math evaluation


      ## Evaluation Notes


      - Default configuration uses **0-shot** evaluation

      - Answers should be formatted within `\boxed{}` for proper extraction

      - Numeric equivalence checking for answer comparison

      - Results can be broken down by difficulty level

      - Commonly used for math reasoning benchmarking due to its manageable size

      '
    eval_split: test
    extra_params: {}
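    # 0-shot prompting, the default noted in the Evaluation Notes above.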
    few_shot_num: 0
    few_shot_prompt_template: null
    few_shot_random: false
    filters: null
    force_redownload: false
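    # Accuracy with numeric equivalence: predicted and reference answers are
    # compared as numbers rather than exact strings (see Evaluation Notes above).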
    metric_list:
    - acc:
        numeric: true
    name: math_500
    output_types:
    - generation
    paper_url: null
    pretty_name: MATH-500
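    # '{question}' is substituted with the problem statement; the doubled braces
    # render as a literal \boxed{} (presumably standard Python format escaping),
    # matching the answer format the extractor expects per the notes above.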
    prompt_template: '{question}

      Please reason step by step, and put your final answer within \boxed{{}}.'
    query_template: null
    review_timeout: null
    sample_example: null
    sandbox_config: {}
    shuffle: false
    shuffle_choices: false
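    # One subset per MATH difficulty level, so results can be broken down by level.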
    subset_list:
    - Level 1
    - Level 2
    - Level 3
    - Level 4
    - Level 5
    system_prompt: null
    tags:
    - Math
    - Reasoning
    train_split: null
dataset_dir: /root/.cache/modelscope/hub/datasets
dataset_hub: modelscope
datasets:
- math_500
debug: false
enable_progress_tracker: false
eval_backend: Native
eval_batch_size: 1
eval_config: null
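# eval_type 'mock_llm' presumably exercises the pipeline with mocked model
# responses instead of querying a live endpoint.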
eval_type: mock_llm
evalscope_version: 1.5.2.post1
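# Generation settings; only the batch size is pinned here.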
generation_config:
  batch_size: 1
ignore_errors: false
judge_model_args: {}
judge_strategy: auto
judge_worker_num: 1
limit: null
model: text_generation
model_args: {}
model_id: qwen3-8B
model_task: text_generation
no_timestamp: true
repeats: 1
rerun_review: true
sandbox_manager_config: {}
sandbox_type: docker
seed: 42
stream: null
timeout: null
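# use_cache points at an existing output directory (the same path as work_dir)
# so cached model outputs can be reused; rerun_review: true above suggests the
# review/scoring stage is re-run over those cached results.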
use_cache: /data1/sunzhq/llm-benchmark/tools/evalscope-data
use_sandbox: false
work_dir: /data1/sunzhq/llm-benchmark/tools/evalscope-data