2026-04-14 14:09:21 - evalscope - INFO: Running with native backend
2026-04-14 14:09:21 - evalscope - INFO: Dump task config to ./outputs/20260414_140921/configs/task_config.yaml
2026-04-14 14:09:21 - evalscope - INFO: {
    "model": "qwen3-8B",
    "model_id": "qwen3-8B",
    "model_args": {},
    "model_task": "text_generation",
    "chat_template": null,
    "datasets": [
        "math_500"
    ],
    "dataset_args": {
        "math_500": {
            "name": "math_500",
            "dataset_id": "../MATH-500",
            "output_types": [
                "generation"
            ],
            "subset_list": [
                "Level 1"
            ],
            "default_subset": "default",
            "few_shot_num": 0,
            "few_shot_random": false,
            "train_split": null,
            "eval_split": "test",
            "prompt_template": "{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
            "few_shot_prompt_template": null,
            "system_prompt": null,
            "query_template": null,
            "pretty_name": "MATH-500",
            "description": "\n## Overview\n\nMATH-500 is a curated subset of 500 problems from the MATH benchmark, designed to evaluate the mathematical reasoning capabilities of language models. It covers five difficulty levels across various mathematical topics including algebra, geometry, number theory, and calculus.\n\n## Task Description\n\n- **Task Type**: Mathematical Problem Solving\n- **Input**: Mathematical problem statement\n- **Output**: Step-by-step solution with final numerical answer\n- **Difficulty Levels**: Level 1 (easiest) to Level 5 (hardest)\n\n## Key Features\n\n- 500 carefully selected problems from the full MATH dataset\n- Five difficulty levels for fine-grained evaluation\n- Problems cover algebra, geometry, number theory, probability, and more\n- Each problem includes a reference solution\n- Designed for efficient yet comprehensive math evaluation\n\n## Evaluation Notes\n\n- Default configuration uses **0-shot** evaluation\n- Answers should be formatted within `\\boxed{}` for proper extraction\n- Numeric equivalence checking for answer comparison\n- Results can be broken down by difficulty level\n- Commonly used for math reasoning benchmarking due to manageable size\n",
            "paper_url": null,
            "data_statistics": null,
            "sample_example": null,
            "tags": [
                "Math",
                "Reasoning"
            ],
            "filters": null,
            "metric_list": [
                {
                    "acc": {
                        "numeric": true
                    }
                }
            ],
            "aggregation": "mean",
            "shuffle": false,
            "shuffle_choices": false,
            "force_redownload": false,
            "review_timeout": null,
            "extra_params": {},
            "sandbox_config": {}
        }
    },
    "dataset_dir": "/root/.cache/modelscope/hub/datasets",
    "dataset_hub": "modelscope",
    "repeats": 1,
    "generation_config": {
        "timeout": 1800.0,
        "batch_size": 32,
        "temperature": 0.0
    },
    "eval_type": "openai_api",
    "eval_backend": "Native",
    "eval_config": null,
    "limit": null,
    "eval_batch_size": 32,
    "use_cache": null,
    "rerun_review": false,
    "work_dir": "./outputs/20260414_140921",
    "no_timestamp": false,
    "enable_progress_tracker": false,
    "ignore_errors": false,
    "debug": false,
    "seed": 42,
    "api_url": "http://0.0.0.0:8000/v1/chat/completions",
    "timeout": 1800.0,
    "stream": null,
    "judge_strategy": "auto",
    "judge_worker_num": 1,
    "judge_model_args": {},
    "analysis_report": false,
    "use_sandbox": false,
    "sandbox_type": "docker",
    "sandbox_manager_config": {},
    "evalscope_version": "1.5.2.post1"
}
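# NOTE (not part of the original run output): the task config dumped above can be
# reproduced programmatically. The snippet below is a minimal sketch, assuming
# evalscope's documented Python entrypoints TaskConfig and run_task; the field
# names mirror the JSON dump, and anything omitted falls back to evalscope's
# own defaults (e.g. the timestamped work_dir).
#
#   from evalscope import TaskConfig, run_task
#
#   task = TaskConfig(
#       model='qwen3-8B',
#       eval_type='openai_api',                                  # from the dump
#       api_url='http://0.0.0.0:8000/v1/chat/completions',
#       datasets=['math_500'],
#       dataset_args={
#           'math_500': {
#               'dataset_id': '../MATH-500',                     # local dataset copy
#               'subset_list': ['Level 1'],
#               'few_shot_num': 0,
#           }
#       },
#       generation_config={'temperature': 0.0, 'batch_size': 32, 'timeout': 1800.0},
#       eval_batch_size=32,
#       seed=42,
#   )
#
#   run_task(task_cfg=task)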
2026-04-14 14:09:21 - evalscope - INFO: Start loading benchmark dataset: math_500
2026-04-14 14:09:21 - evalscope - INFO: Loading dataset ../MATH-500 from local > subset: default > split: test ...
2026-04-14 14:09:21 - evalscope - INFO: Start evaluating 1 subsets of math_500: ['Level 1']
2026-04-14 14:09:21 - evalscope - INFO: Unified pool: 43 items to process, 0 already fully cached (43 total across all subsets).
2026-04-14 14:09:21 - evalscope - INFO: Loading model for prediction...
2026-04-14 14:09:21 - evalscope - INFO: Creating model qwen3-8B with eval_type=openai_api base_url=http://0.0.0.0:8000/v1/chat/completions, config={'timeout': 1800.0, 'retries': 5, 'retry_interval': 10, 'batch_size': 32, 'temperature': 0.0}, model_args={}
2026-04-14 14:09:21 - evalscope - INFO: Model loaded successfully.
2026-04-14 14:09:42 - evalscope - INFO: Evaluating[math_500]  14%| 6/43 [Elapsed: 00:20 < Remaining: 02:26,  3.95s/it]
2026-04-14 14:09:42 - evalscope - INFO: Running[eval]   0%| 0/1 [Elapsed: 00:21 < Remaining: ?, ?benchmark/s]