Commit be3dfa50 authored by jerrrrry

Initial commit
from mmengine.config import read_base
from opencompass.models.huggingface import HuggingFaceCausalLM
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.math.math_gen_736506 import math_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_7b import \
        models as internlm_math_chat_7b_models
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_math_20b import \
        models as internlm_math_chat_20b_models
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# Evaluate MATH and GSM8K for both the InternLM2-Math-Chat 7B and 20B models
datasets = [*math_datasets, *gsm8k_datasets]
models = [*internlm_math_chat_7b_models, *internlm_math_chat_20b_models]
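# A minimal launch sketch for a config like the one above (the file name
# configs/eval_internlm_math_chat.py is an assumption about where this snippet lives):
#     python run.py configs/eval_internlm_math_chat.py
# The output directory can be set via work_dir in the config, as the later
# configs in this commit do.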
from mmengine.config import read_base
from opencompass.models.turbomind import TurboMindModel
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
        ceval_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
        gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \
        WiC_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import \
        triviaqa_datasets
    # and output the results in a chosen format
    from opencompass.configs.summarizers.medium import summarizer
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# config for internlm-7b model
internlm_7b = dict(
    type=TurboMindModel,
    abbr='internlm-7b-turbomind',
    path='internlm/internlm-7b',
    engine_config=dict(session_len=2048,
                       max_batch_size=32,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=32,
    concurrency=32,
    run_cfg=dict(num_gpus=1, num_procs=1),
)

# config for internlm-20b model
internlm_20b = dict(
    type=TurboMindModel,
    abbr='internlm-20b-turbomind',
    path='internlm/internlm-20b',
    engine_config=dict(session_len=2048,
                       max_batch_size=8,
                       rope_scaling_factor=1.0),
    gen_config=dict(top_k=1, top_p=0.8, temperature=1.0, max_new_tokens=100),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
models = [internlm_20b]
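# Only the 20B engine is evaluated above. To benchmark both engines in a single
# run, a small variant of this config would be:
# models = [internlm_7b, internlm_20b]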
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import judgerbench_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable sampling (do_sample) for the models
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='CompassJudger-1-7B-Instruct',
        path='opencompass/CompassJudger-1-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]
datasets = judgerbench_datasets
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)

# -------------Evaluation Stage ----------------------------------------
## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=NaivePartitioner,
        n=10,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)
work_dir = 'outputs/judgerbench/'
from mmengine import read_base
with read_base():
    from opencompass.configs.datasets.korbench.korbench_mixed_gen_d00bdd import \
        korbench_mixed_datasets as mixed_datasets
    from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
        korbench_0shot_single_datasets as zero_shot_datasets
    from opencompass.configs.datasets.korbench.korbench_single_3_shot_gen import \
        korbench_3shot_single_datasets as three_shot_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b
datasets = zero_shot_datasets + three_shot_datasets + mixed_datasets
models = hf_internlm2_5_7b
from mmengine.config import read_base
from opencompass.models import LightllmAPI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
    from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_a82cae import \
        humaneval_datasets
    from opencompass.configs.summarizers.leaderboard import summarizer
datasets = [*humaneval_datasets]
'''
# Prompt template for InternLM2-Chat
# https://github.com/InternLM/InternLM/blob/main/chat/chat_format.md
_meta_template = dict(
    begin='<|im_start|>system\nYou are InternLM2-Chat, a harmless AI assistant<|im_end|>\n',
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ]
)
'''
_meta_template = None
models = [
    dict(
        abbr='LightllmAPI',
        type=LightllmAPI,
        url='http://localhost:1030/generate',
        meta_template=_meta_template,
        batch_size=32,
        max_workers_per_task=128,
        rate_per_worker=1024,
        retry=4,
        generation_kwargs=dict(do_sample=False,
                               ignore_eos=False,
                               max_new_tokens=1024),
    ),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLInferTask),
    ),
)
from mmengine.config import read_base
from opencompass.models import OpenAISDK
with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.livestembench.livestembench_gen_3e3c50 import \
        livestembench_datasets
    # choose a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as qwen2_5_7b_instruct_lmdeploy_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_lmdeploy_model
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = [
    *qwen2_5_7b_instruct_lmdeploy_model, *qwen2_5_72b_instruct_lmdeploy_model
]
# Judge model configuration
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    abbr='qwen2-5-72b-instruct',
    type=OpenAISDK,
    path='YOUR_SERVER_MODEL_NAME',  # the model name of your deployment
    key='None',
    openai_api_base=[
        'http://localhost:23333/v1',  # the address where your model is served
    ],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=16,
    temperature=0.001,
    max_completion_tokens=32768,
)
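# The judge above assumes an OpenAI-compatible endpoint is already running at
# http://localhost:23333/v1. With LMDeploy such a server can be started roughly
# like this (a sketch; the model path is a placeholder, not part of this config):
#     lmdeploy serve api_server Qwen/Qwen2.5-72B-Instruct --server-port 23333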
for dataset in datasets:
    dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=256,
        task=dict(type=OpenICLEvalTask),
    ),
)
work_dir = './outputs/livestembench'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.collections.base_medium_llama import (
        piqa_datasets, siqa_datasets)
    from opencompass.configs.models.llama.llama2_7b import models
datasets = [*piqa_datasets, *siqa_datasets]
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.lveval.lveval import \
        LVEval_datasets as datasets
    from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models
    from opencompass.configs.summarizers.lveval import summarizer
models[0]['path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf'
models[0]['tokenizer_path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf'
models[0]['max_seq_len'] = 4096
models[0]['generation_kwargs'] = dict(do_sample=False)
models[0]['mode'] = 'mid' # truncate in the middle
from mmengine.config import read_base
with read_base():
    from opencompass.configs.dataset_collections.chat_OC15 import datasets
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as hf_llama3_8b_instruct_model
    from opencompass.configs.summarizers.chat_OC15 import summarizer
work_dir = 'outputs/debug/llama3-instruct'
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# dataset                version    metric                        mode    llama-3-8b-instruct-hf
# ---------------------  ---------  ----------------------------  ------  ------------------------
# average                -          naive_average                 gen     55.64
# mmlu                   -          naive_average                 gen     68.30
# cmmlu                  -          naive_average                 gen     53.29
# ceval                  -          naive_average                 gen     52.32
# GaokaoBench            -          weighted_average              gen     45.91
# triviaqa_wiki_1shot    eaf81e     score                         gen     79.01
# nq_open_1shot          01cf41     score                         gen     30.25
# race-high              9a54b6     accuracy                      gen     81.22
# winogrande             b36770     accuracy                      gen     66.46
# hellaswag              e42710     accuracy                      gen     74.33
# bbh                    -          naive_average                 gen     67.25
# gsm8k                  1d7fe4     accuracy                      gen     79.08
# math                   393424     accuracy                      gen     27.78
# TheoremQA              6f0af8     score                         gen     19.50
# openai_humaneval       8e312c     humaneval_pass@1              gen     55.49
# sanitized_mbpp         830460     score                         gen     66.54
# GPQA_diamond           4baadb     accuracy                      gen     25.76
# IFEval                 3321a3     Prompt-level-strict-accuracy  gen     67.84
# -                      -          -                             -
# mmlu                   -          naive_average                 gen     68.30
# mmlu-stem              -          naive_average                 gen     57.92
# mmlu-social-science    -          naive_average                 gen     77.83
# mmlu-humanities        -          naive_average                 gen     71.20
# mmlu-other             -          naive_average                 gen     71.79
# cmmlu                  -          naive_average                 gen     53.29
# cmmlu-stem             -          naive_average                 gen     45.40
# cmmlu-social-science   -          naive_average                 gen     54.63
# cmmlu-humanities       -          naive_average                 gen     54.14
# cmmlu-other            -          naive_average                 gen     59.52
# cmmlu-china-specific   -          naive_average                 gen     49.33
# ceval                  -          naive_average                 gen     52.32
# ceval-stem             -          naive_average                 gen     48.16
# ceval-social-science   -          naive_average                 gen     57.50
# ceval-humanities       -          naive_average                 gen     53.26
# ceval-other            -          naive_average                 gen     54.26
# ceval-hard             -          naive_average                 gen     35.59
from mmengine.config import read_base
with read_base():
    # LLM compression datasets
    from opencompass.configs.datasets.llm_compression.llm_compression import llm_compression_datasets
    # Model configs
    from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
    from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
    from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
    from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import LLMCompressionSummarizer
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
# -------------Inference Stage ----------------------------------------
datasets = [*llm_compression_datasets]
work_dir = 'outputs/llm_compression'

models = [
    *qwen1_5_7b,
    *qwen1_5_14b,
    *llama2_7b,
    *llama2_13b,
]

# Set custom batch_size and num_gpus for faster loss calculation
# Smaller batch_size should give more precise results, at the cost of worse performance
model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
for mdl in models:
    mdl.update(model_cfg)
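# Optional variant: dict.update makes every model share the same run_cfg object.
# If each model should own an independent copy (for example, to tweak GPUs per
# model later), copy the template first -- a minor alternative, not required here:
#     from copy import deepcopy
#     for mdl in models:
#         mdl.update(deepcopy(model_cfg))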
infer = dict(
    # The OpenCompass implementation of BPC currently only supports NaivePartitioner,
    # as the sliding window approach requires the dataset to be loaded sequentially.
    # Using other partitioner types may produce incorrect results.
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # Maximum concurrent evaluation task count
    ),
)
# -------------Evaluation Stage ----------------------------------------
eval = dict(partitioner=dict(type=NaivePartitioner),
            runner=dict(
                type=LocalRunner,
                task=dict(type=OpenICLEvalTask),
                max_num_workers=256,
            ))
# -------------Summarization Stage ----------------------------------------
summarizer = dict(type=LLMCompressionSummarizer)
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAISDK
# Import pre-configured models from OpenCompass
with read_base():
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
        models as lmdeploy_qwen2_5_14b_instruct_model,
    )
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import CustomDataset
# Dataset reader configuration
math_reader_cfg = dict(input_columns=['problem'], output_column='answer')
# Inference configuration
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nRemember to put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration using LLM as judge
math_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                    )
                ],
                round=[
                    dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                ],
            ),
        ),
        dataset_cfg=dict(
            type=CustomDataset,
            path='opencompass/math',
            file_name='test_prm800k_500.jsonl',
            reader_cfg=math_reader_cfg,
        ),
        judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0],
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
)
# Dataset configuration
datasets = [
    dict(
        type=CustomDataset,
        path='opencompass/math',
        file_name='test_prm800k_500.jsonl',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
# Model to be evaluated
models = lmdeploy_qwen2_5_7b_instruct_model
# Limiting test to first 8 examples for quick testing
math_reader_cfg['test_range'] = '[0:8]'
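# The '[0:8]' slice keeps only the first 8 test examples. For a full evaluation,
# drop this override, e.g. (an illustrative toggle, not part of the original config):
#     math_reader_cfg.pop('test_range', None)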
# Output directory
work_dir = 'outputs/llm_judge'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
        gsm8k_datasets
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_1_8b_chat import \
        models
datasets = gsm8k_datasets
models = models
from mmengine.config import read_base
with read_base():
    # Models
    # Datasets
    from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
        LongBenchv2_datasets as LongBenchv2_datasets
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
        models as lmdeploy_qwen2_5_7b_instruct_model
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
    model['max_seq_len'] = 128 * 1024
    model['engine_config']['session_len'] = 128 * 1024
    model['engine_config']['tp'] = 2
    model['run_cfg']['num_gpus'] = 2
    # Drop middle tokens to keep the input shorter than session_len; 128k is used
    # to stay in sync with the original LongBench v2 code.
    # Dropping the middle is currently only supported for LMDeploy models.
    model['drop_middle'] = True
work_dir = './outputs/longbenchv2'
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
from mmengine.config import read_base
with read_base():
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model  # noqa: F401, F403
    from opencompass.configs.models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model  # noqa: F401, F403
    from opencompass.configs.datasets.math.math_llm_judge import math_datasets  # noqa: F401, F403
from opencompass.datasets import math_judement_preprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import AllObjSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Prompt Settings ----------------------------------------
eng_obj_prompt = """
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
Examples:
Expression 1: $2x+3$
Expression 2: $3+2x$
[Yes]
Expression 1: 3/2
Expression 2: 1.5
[Yes]
Expression 1: $x^2+2x+1$
Expression 2: $y^2+2y+1$
[No]
Expression 1: $x^2+2x+1$
Expression 2: $(x+1)^2$
[Yes]
Expression 1: 3245/5
Expression 2: 649
[No]
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
Expression 1: 2/(-3)
Expression 2: -2/3
[Yes]
(trivial simplifications are allowed)
Expression 1: 72 degrees
Expression 2: 72
[Yes]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2: 64 square feet
[Yes]
(give benefit of the doubt to units)
Expression 1: 64
Expression 2:
[No]
(only mark as equivalent if both expressions are nonempty)
---
YOUR TASK
Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
Expression 1: {obj_gold}
Expression 2: {prediction}
"""
# -------------Inference Stage ----------------------------------------
# eval models
models = [*hf_llama3_8b_instruct_model]
# judge models
judge_models = hf_llama3_70b_instruct_model
eng_datasets = [*math_datasets]
chn_datasets = []
datasets = eng_datasets + chn_datasets
work_dir = 'outputs/obj_all/'
for d in eng_datasets:
    d['eval_cfg'] = dict(
        evaluator=dict(
            type=LMEvaluator,
            # If you need to preprocess the prediction before judging,
            # you can specify the pred_postprocessor function here
            pred_postprocessor=dict(type=math_judement_preprocess),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role='HUMAN', prompt=eng_obj_prompt),
                ]),
            ),
        ),
        pred_role='BOT',
    )
infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=40000),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)
# ------------- Evaluation Configuration --------------------------------
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=80000,
        mode='singlescore',
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=AllObjSummarizer)
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.math.math_0shot_llm_judge_v2_gen_31d777 import \
        math_datasets
    # choose a model of interest
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as qwen2_5_72b_instruct_model
eval_model_name = 'eval_model_name'
postprocessor_model_name = 'postprocessor_model_name'
eval_model_urls = ['http://0.0.0.0:23333/v1']
postprocessor_model_urls = ['http://0.0.0.0:23333/v1']
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for dataset in datasets:
    dataset['eval_cfg']['evaluator']['model_name'] = eval_model_name
    dataset['eval_cfg']['evaluator']['url'] = eval_model_urls
    dataset['eval_cfg']['evaluator']['post_url'] = postprocessor_model_urls
    dataset['eval_cfg']['evaluator']['post_model_name'] = postprocessor_model_name
# -------------Inference Stage ----------------------------------------
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
    from opencompass.configs.datasets.math.math_500_gen import math_datasets
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-llama-8b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        gen_config=dict(
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-14b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
        gen_config=dict(
            top_k=1,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]
datasets = [*math_datasets]
work_dir = './outputs/math_500'
from mmengine.config import read_base
with read_base():
    # Import models
    # Import datasets
    from opencompass.configs.datasets.MathBench.mathbench_gen import \
        mathbench_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
        models as internlm2_chat_7b_model
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as llama3_8b_instruct_model
    # Import summarizers for displaying results
    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
        summarizer  # Grouped results for MathBench-A and MathBench-T separately
    # from opencompass.configs.summarizers.mathbench_v1 import summarizer  # Detailed results for every sub-dataset
    # from opencompass.configs.summarizers.groups.mathbench_v1_2024_lang import summarizer  # Grouped bilingual results
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=8),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)

infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=4),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLInferTask)),
)
work_dir = './outputs/mathbench_results'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.mmlu_cf.mmlu_cf_gen_040615 import \
        mmlu_cf_datasets
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_7b_instruct import \
        models as hf_qwen2_5_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_cf import summarizer
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner,
                max_num_workers=8,
                task=dict(type=OpenICLInferTask)),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=256,
                task=dict(type=OpenICLEvalTask)),
)
work_dir = 'outputs/debug/mmlu_cf'
from mmengine.config import read_base
with read_base():
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import \
        mmlu_pro_datasets
    from opencompass.configs.internal.clusters.local import eval
    from opencompass.configs.internal.clusters.local import \
        infer_num_worker as infer
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model
    from opencompass.configs.summarizers.mmlu_pro import summarizer
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = 'outputs/debug/mmlu_pro'
# dataset                    version    metric         mode    qwen2-7b-instruct-turbomind    llama-3-8b-instruct-turbomind
# -------------------------  ---------  -------------  ------  -----------------------------  -------------------------------
# mmlu_pro                   -          naive_average  gen     46.18                          43.92
# mmlu_pro_biology           736233     accuracy       gen     63.74                          64.02
# mmlu_pro_business          736233     accuracy       gen     53.23                          46.01
# mmlu_pro_chemistry         736233     accuracy       gen     35.25                          32.42
# mmlu_pro_computer_science  736233     accuracy       gen     47.07                          44.88
# mmlu_pro_economics         736233     accuracy       gen     59.00                          53.79
# mmlu_pro_engineering       736233     accuracy       gen     26.73                          33.54
# mmlu_pro_health            736233     accuracy       gen     47.31                          51.34
# mmlu_pro_history           736233     accuracy       gen     42.78                          42.26
# mmlu_pro_law               736233     accuracy       gen     28.07                          26.98
# mmlu_pro_math              736233     accuracy       gen     53.59                          37.53
# mmlu_pro_philosophy        736233     accuracy       gen     42.28                          42.48
# mmlu_pro_physics           736233     accuracy       gen     39.11                          33.64
# mmlu_pro_psychology        736233     accuracy       gen     60.90                          59.65
# mmlu_pro_other             736233     accuracy       gen     47.40                          46.32
from copy import deepcopy
from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
        mmlu_datasets  # this is a dataset evaluated with 5-shot
    from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
datasets = []
for d in mmlu_datasets:
    d = deepcopy(d)
    d['infer_cfg']['retriever'] = dict(type=ZeroRetriever)
    datasets.append(d)
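# The loop above turns the 5-shot MMLU config into a 0-shot variant by swapping
# in a ZeroRetriever. If both variants run in the same workspace, giving the
# copies a distinct abbr avoids result-name collisions (an illustrative tweak,
# not part of the original config), e.g. inside the loop:
#     d['abbr'] = d['abbr'] + '-0shot'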