eval_mulit_model.py 2.82 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import LCB_datasets
    from opencompass.configs.datasets.math.math_500_gen import math_datasets
    from opencompass.configs.datasets.ceval.ceval_zero_shot_gen_bd40ef import ceval_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import mmlu_pro_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import humaneval_datasets
    from opencompass.configs.summarizers.example import summarizer

# Collect every imported *_datasets list (gsm8k, mmlu, ...) into one flat
# evaluation set. The `k == 'datasets'` clause mirrors the original filter.
_dataset_lists = [
    v for k, v in locals().items()
    if k.endswith("_datasets") or k == 'datasets'
]
datasets = sum(_dataset_lists, [])
work_dir = '/workspace/logs/offline'  # output directory for run logs/predictions


from opencompass.models import VLLM
from opencompass.models import VLLMwithChatTemplate
from opencompass.partitioners import NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

# Model roster: (abbr, path, tensor_parallel, enforce_eager, dtype, max_len, batch_size)
settings = [  # abbr, path, tp, enforce_eager, data_type, max_len, batch_size
    ('Qwen3-32B', '/models/qwen3/Qwen3-32B', 2, False, 'bfloat16', 32768, 32),
    ('Qwen3-30B-A3B', '/models/qwen3/Qwen3-30B-A3B', 2, False, 'bfloat16', 32768, 32),
]

# Build one vLLM-backed model config per roster entry.
models = [
    dict(
        # Use VLLMwithChatTemplate for chat checkpoints; plain VLLM for base models.
        type=VLLMwithChatTemplate,
        abbr=abbr,
        path=path,
        model_kwargs=dict(
            tensor_parallel_size=tp,
            dtype=data_type,
            max_model_len=max_len,
            enforce_eager=eager,
            gpu_memory_utilization=0.95,
            # For int4 checkpoints add quantization="awq" or quantization="gptq".
        ),
        max_out_len=max_len,
        max_seq_len=max_len,
        batch_size=batch_size,
        # Strip <think>-style reasoning before scoring the prediction.
        pred_postprocessor=dict(
            type='opencompass.utils.text_postprocessors.extract_non_reasoning_content'),
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=tp, num_procs=1),
    )
    for abbr, path, tp, eager, data_type, max_len, batch_size in settings
]
# Inference orchestration: how each model/dataset pair is sharded into tasks
# and how those tasks are executed on the local machine.
# NOTE(review): NumWorkerPartitioner, LocalRunner and OpenICLInferTask were
# referenced without being imported (NameError on config load); they are now
# imported at the top of the file.
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8,     # tasks carved out per model; keep <= max_num_workers
        num_split=8,      # splits per dataset; when None, falls back to num_worker
        min_task_size=16, # minimum number of samples per split
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,  # max concurrent tasks; suggested: gpu_count / model tp
        task=dict(type=OpenICLInferTask),  # the task type to execute
    ),
)