# Math-reasoning evaluation entry. Benchmarks referenced by this config:
#   AIME-2024 (8 repeats), MATH-500, OlympiadBench, OmniMath,
#   LiveMathBench-202412-Hard
import os.path as osp
from itertools import product

from mmengine.config import read_base

from opencompass.models import OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
from opencompass.models import (
    TurboMindModelwithChatTemplate,
)

#######################################################################
#                        PART 1  Datasets List                        #
#######################################################################
with read_base():
    # AIME-2024, LLM-verified, 8 runs per problem.
    from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import \
        aime2024_datasets
    # Summarizer groups for OlympiadBench math subsets.
    from opencompass.configs.summarizers.groups.OlympiadBench import \
        OlympiadBenchMath_summary_groups

# Flatten every `*_datasets` list pulled in above into a single list.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

# LLM verifier (judge) shared by each dataset that accepts one.
verifier_cfg = dict(
    abbr='Qwen3-32B',
    type=OpenAISDK,
    path='/models/qwen3/Qwen3-32B/',  # You need to set your own judge model path
    key='EMPTY',  # You need to set your own API key
    openai_api_base=[
        'http://0.0.0.0:8000/v1',  # You need to set your own API base
    ],
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ],
    ),
    query_per_second=16,
    batch_size=64,
    temperature=0.001,
    tokenizer_path='/models/qwen3/Qwen3-32B/',
    verbose=True,
    max_out_len=16384,
    # max_seq_len=32768,
    pred_postprocessor=dict(type=extract_non_reasoning_content),
    max_seq_len=40960,
)

# Wire the judge into every evaluator that declares a `judge_cfg` slot;
# datasets without that slot are left untouched.
for item in datasets:
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg
#######################################################################
#                         PART 2  Model List                          #
#######################################################################
# Pick up any `*_model` entries defined via read_base (none in this file).
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

# Model under evaluation, served behind an OpenAI-compatible endpoint.
models += [
    dict(
        abbr='DeepSeek-R1-INT8',
        type=OpenAISDK,
        path='/nvme/models/DeepSeek-R1-INT8/',
        openai_api_base='http://0.0.0.0:8000/v1',
        tokenizer_path='/nvme/models/DeepSeek-R1-INT8/',
        key='EMPTY',
        meta_template=api_meta_template,
        query_per_second=64,
        max_out_len=32768,
        max_seq_len=32768,
        temperature=0.7,
        pred_postprocessor=dict(type=extract_non_reasoning_content),
        batch_size=32,
    ),
]

#######################################################################
#                    PART 3  Inference/Evaluation                     #
#######################################################################
# Inference configuration
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        # Similar to data-parallelism: how many workers for inference;
        # each worker handles a part of the dataset.
        # Total GPUs = num_worker * num_gpus_per_worker.
        # E.g. with 8 GPUs and a 7B model on 1 GPU per instance, set
        # num_worker=8 to max-utilize the GPUs; with a 14B model on
        # 2 GPUs per instance, set num_worker=4.
        num_worker=1,
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
    ),
)

#######################################################################
#                         PART 4  Summarizer                          #
#######################################################################
# Merge all `*_summary_groups` lists pulled in via read_base.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')],
    [],
)
summary_groups.extend([
    {
        # NOTE(review): fixed label typo "Aveage" -> "Average"; the same
        # label is referenced again in summarizer.dataset_abbrs below and
        # the two must match for the group average to be reported.
        'name': 'AIME2024-Average8',
        'subsets': [[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)],
    },
    {
        # Defined for when the LiveMathBench-Hard datasets are enabled;
        # harmless when they are not loaded.
        'name': 'LiveMathBench-v202412-Hard-Average8',
        'subsets': [
            [f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy']
            for split, run_idx in product(['hard_cn', 'hard_en'], range(8))
        ],
    },
])

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        ['AIME2024-Average8', 'naive_average'],
    ],
    summary_groups=summary_groups,
)

#######################################################################
#                           PART 5  Utils                             #
#######################################################################
work_dir = '/workspace/logs/aime_r1_int8/'