# eval_deepseek_r1.py
# Support AIME-2024 with Repeat8
# Support MATH-500
# Support OlympiadBench
# Support OmniMath
# Support LiveMathBench-202412-Hard

import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner
from opencompass.models import (
    TurboMindModelwithChatTemplate,
)

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################
with read_base():
    # You can comment out the datasets you don't want to evaluate

    # Datasets
    # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets


    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)
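
# How the collection above works: read_base() injects names such as
# `aime2024_datasets` into this module's namespace, and sum(..., [])
# concatenates every list whose name ends with '_datasets'. A minimal
# sanity check (a sketch, not part of the upstream config):
assert all(isinstance(d, dict) for d in datasets), 'expected a list of dataset dicts'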

# Set the LLM verifier (judge model) used by each dataset

verifier_cfg = dict(
    abbr='qwen2-5-32B-Instruct',
    type=OpenAISDK,
    path='Qwen/Qwen2.5-32B-Instruct',  # You need to set your own judge model path
    key='sk-1234',  # You need to set your own API key
    openai_api_base=[
        'http://172.30.56.1:4000/v1',  # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=1024,
    temperature=0.001,
    tokenizer_path='gpt-4o-2024-05-13',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    max_seq_len=49152,
)

for item in datasets:
    # item['infer_cfg']['inferencer']['max_out_len'] = 32768  # Uncomment this line to avoid length cutoff
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
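
# The loop above swaps the judge model into every dataset whose evaluator
# already exposes a 'judge_cfg' slot; datasets without one keep their
# rule-based checker. A hypothetical spot check (left commented out):
# assert datasets[0]['eval_cfg']['evaluator']['judge_cfg']['abbr'] == 'qwen2-5-32B-Instruct'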


#######################################################################
#                          PART 2  Model List                         #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

models += [
    # You can comment out the models you don't want to evaluate
    # All models use sampling mode
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=128, tp=1),
        gen_config=dict(
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=64,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-14b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=2),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=32768,
    #     ),
    #     max_seq_len=32768,
    #     max_out_len=32768,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=2),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
    # dict(
    #     type=TurboMindModelwithChatTemplate,
    #     abbr='deepseek-r1-distill-qwen-32b-turbomind',
    #     path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    #     engine_config=dict(session_len=32768, max_batch_size=128, tp=4),
    #     gen_config=dict(
    #         do_sample=True,
    #         temperature=0.6,
    #         top_p=0.95,
    #         max_new_tokens=16384,
    #     ),
    #     max_seq_len=32768,
    #     max_out_len=16384,
    #     batch_size=128,
    #     run_cfg=dict(num_gpus=4),
    #     pred_postprocessor=dict(type=extract_non_reasoning_content),
    # ),
]
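
# Sanity check (a sketch, assuming every entry follows the TurboMind layout
# above): tensor parallelism in engine_config must match the per-instance
# GPU allocation, or the runner will mis-subscribe GPUs.
for _m in models:
    assert _m['engine_config']['tp'] == _m['run_cfg']['num_gpus'], _m['abbr']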

#######################################################################
#                          PART 3  Inference/Evaluation               #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=1,
        # Similar to data parallelism: each worker runs inference on a slice
        # of the dataset, and total GPUs = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model that needs 1 GPU per
        # instance, set num_worker=8 to fully utilize the GPUs; for a 14B
        # model that needs 2 GPUs per instance, set num_worker=4.
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)
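
# Example (assuming an 8-GPU node and the 1-GPU 7B model above): sharding
# the datasets across eight concurrent TurboMind instances would look like
#   infer['partitioner']['num_worker'] = 8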

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner, n=8
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask)
    ),
)


#######################################################################
#                          PART 4  Summarizer                         #
#######################################################################


summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])
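
# 'naive_average' over a summary group is the plain mean of the per-run
# scores, so 'AIME2024-Average8' reports average-of-8 accuracy. A sketch
# with hypothetical numbers:
#   per_run = [40.0, 43.3, 36.7, 40.0, 43.3, 40.0, 36.7, 43.3]
#   aime_avg8 = sum(per_run) / len(per_run)  # -> 40.41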

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'MATH',
        # ['LiveMathBench-k1-n1', 'pass@1'],
        # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'],
        # ['aime2024', 'accuracy'],
        ['math_prm800k_500-llmjudge', 'accuracy'],
        ['AIME2024-Average8', 'naive_average'],
        ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],
        ['OlympiadBenchMath', 'accuracy'],
        ['OmniMath', 'accuracy'],
    ],
    summary_groups=summary_groups,
)


#######################################################################
#                          PART 5  Utils                              #
#######################################################################

work_dir = 'outputs/deepseek_r1_reasoning'
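
# Launch example (a sketch; assumes this file sits in your OpenCompass
# checkout and the judge endpoint above is reachable):
#   python run.py eval_deepseek_r1.py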