# Supports AIME-2024 with 8 repeated runs (Repeat8)
# Supports MATH-500
# Supports OlympiadBench
# Supports OmniMath
# Supports LiveMathBench-202412-Hard
# (Only AIME-2024 is enabled below; sketches for the others appear as
# commented-out imports in PART 1.)

import os.path as osp
from itertools import product
from opencompass.models import OpenAISDK, TurboMindModelwithChatTemplate
from mmengine.config import read_base
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.runners import LocalRunner

#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################
with read_base():
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets  # 8 runs
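    # The other benchmarks named in the header would be imported here as well.
    # A hedged sketch follows: the module names (including the hash suffixes,
    # shown as xxxxxx) are assumptions, not verified paths -- check
    # opencompass/configs/datasets for the actual generated file names.
    # from opencompass.configs.datasets.math.math_500_llmverify_gen_xxxxxx import math_datasets
    # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_llmverify_gen_xxxxxx import olympiadbench_datasets
    # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_xxxxxx import omnimath_datasets
    # from opencompass.configs.datasets.livemathbench.livemathbench_hard_llmverify_gen_xxxxxx import livemathbench_datasets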
    # Summarizer
    from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups

datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# Set the LLM verifier (judge model) used by each dataset's evaluator

verifier_cfg = dict(
    abbr='Qwen3-32B',
    type=OpenAISDK,
    path='/models/qwen3/Qwen3-32B/', # You need to set your own judge model path
    key='EMPTY', # You need to set your own API key
    openai_api_base=[
        'http://0.0.0.0:8000/v1', # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=64,
    temperature=0.001,
    tokenizer_path='/models/qwen3/Qwen3-32B/',
    verbose=True,
    max_out_len=16384,
    max_seq_len=40960,
    pred_postprocessor=dict(type=extract_non_reasoning_content),
)
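
# The verifier above assumes an OpenAI-compatible endpoint is already serving
# the judge model at openai_api_base. A hypothetical launch command (LMDeploy
# shown; any OpenAI-compatible server such as vLLM works, and the port is a
# placeholder):
#   lmdeploy serve api_server /models/qwen3/Qwen3-32B/ --server-port 8000
# Note: the evaluated model in PART 2 also points at port 8000; if the judge
# and the candidate model are served separately, give them distinct ports.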

for item in datasets:
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
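
# Optional sanity check (a hypothetical snippet, not part of the original
# config): uncomment to confirm every dataset picked up the judge config.
# for item in datasets:
#     assert item['eval_cfg']['evaluator'].get('judge_cfg', {}).get('abbr') == 'Qwen3-32B'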


#######################################################################
#                          PART 2  Model List                         #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models += [
    dict(
        abbr='DeepSeek-R1-INT8',
        type=OpenAISDK,
        path='/nvme/models/DeepSeek-R1-INT8/',
        openai_api_base='http://0.0.0.0:8000/v1',
        tokenizer_path='/nvme/models/DeepSeek-R1-INT8/',
        key='EMPTY',
        meta_template=api_meta_template,
        query_per_second=64,
        max_out_len=32768,
        max_seq_len=32768,
        temperature=0.7,
        pred_postprocessor=dict(type=extract_non_reasoning_content),
        batch_size=32,
    ),
]
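
# TurboMindModelwithChatTemplate (imported above) evaluates a local model via
# LMDeploy instead of an API endpoint. A minimal commented-out sketch, assuming
# LMDeploy is installed; the model path and all parameter values are
# placeholders, not values from this config:
# models += [
#     dict(
#         type=TurboMindModelwithChatTemplate,
#         abbr='deepseek-r1-distill-qwen-7b-turbomind',
#         path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
#         engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
#         gen_config=dict(temperature=0.6, top_p=0.95, max_new_tokens=16384),
#         max_seq_len=32768,
#         max_out_len=16384,
#         batch_size=16,
#         run_cfg=dict(num_gpus=1),
#     ),
# ]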

#######################################################################
#                          PART 3  Inference/Evaluation               #
#######################################################################

# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        # Similar to data parallelism: each worker runs inference on a slice
        # of the dataset. Total GPUs used = num_worker * num_gpus_per_worker.
        # For example, with 8 GPUs and a 7B model that fits on 1 GPU per
        # instance, set num_worker=8 to fully utilize the GPUs; with a 14B
        # model that needs 2 GPUs per instance, set num_worker=4.
        num_worker=1,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)


#######################################################################
#                          PART 4  Summarizer                         #
#######################################################################


summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)

summary_groups.extend([
    {
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        ['AIME2024-Average8', 'naive_average'],
    ],
    summary_groups=summary_groups,
)
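
# Note: the LiveMathBench-v202412-Hard-Average8 group is defined above but not
# listed in dataset_abbrs. A hypothetical entry, relevant only once the
# LiveMathBench datasets are actually imported in PART 1:
#     ['LiveMathBench-v202412-Hard-Average8', 'naive_average'],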


#######################################################################
#                          PART 5  Utils                              #
#######################################################################

work_dir = "/workspace/logs/aime_r1_int8/"
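
# Hypothetical launch command (the standard OpenCompass entry point; adjust
# the path to wherever this file is saved):
#   opencompass eval_aime24.py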