from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models.lagent import LagentAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_agent_gen_af2293 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import \
mathbench_agent_datasets
from opencompass.configs.summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets
system_prompt = """You are a helpful assistant who uses tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use the code tool to calculate. The example format is as follows:
```
def solution():
    variable_names_with_real_meaning = func(variable)
    return variable_names_with_real_meaning
```"""
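# For illustration (not part of the original config), a model reply that follows
# the required format above would look like:
#
#     def solution():
#         total_apples = 3 * 4 + 2  # three bags of four apples, plus two loose
#         return total_apples
#
# The PythonInterpreter action is expected to execute the function and hand its
# return value back to the ReAct loop.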
protocol = dict(
type=ReActProtocol,
action=dict(role='ACTION', begin='Tool:', end='\n'),
action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
finish=dict(role='FINISH', begin='FinalAnswer:', end='\n'),
call_protocol=system_prompt,
)
models = [
dict(
abbr='gpt-3.5-react',
type=LagentAgent,
agent_type=ReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=PythonInterpreter),
],
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
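# A config like this is typically launched with OpenCompass's runner script,
# e.g. (the filename here is an assumption):
#   python run.py configs/eval_gsm8k_math_agent.py -w outputs/math_agent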
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import \
mathbench_datasets
from opencompass.configs.summarizers.math_baseline import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_datasets
models = [
dict(
abbr='gpt-3.5-react',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import \
math_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import \
models as hf_internlm2_chat_1_8b_models
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
models as hf_qwen2_1_5b_instruct_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models
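# A minimal demo like this one can be run sequentially for easier debugging,
# e.g. (the filename here is an assumption):
#   python run.py configs/eval_chat_demo.py --debug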
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.openicl import ChatInferencer
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets as datasets
models = [
dict(
abbr='gpt-3.5',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
for dataset in datasets:
# Use ChatInferencer instead of GenInferencer
dataset['infer_cfg']['inferencer'] = dict(type=ChatInferencer)
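# Roughly speaking, ChatInferencer feeds the prompt to the model as structured
# chat turns (HUMAN/BOT rounds), matching how chat APIs such as gpt-3.5-turbo
# consume input, whereas GenInferencer flattens everything into one completion
# prompt.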
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ChemBench.ChemBench_gen import \
chembench_datasets
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models
datasets = [*chembench_datasets]
models = [*models]
'''
dataset version metric mode mistral-7b-instruct-v0.2-hf
-------------------------------- --------- -------- ------ -----------------------------
ChemBench_Name_Conversion d4e6a1 accuracy gen 45.43
ChemBench_Property_Prediction d4e6a1 accuracy gen 47.11
ChemBench_Mol2caption d4e6a1 accuracy gen 64.21
ChemBench_Caption2mol d4e6a1 accuracy gen 35.38
ChemBench_Product_Prediction d4e6a1 accuracy gen 38.67
ChemBench_Retrosynthesis d4e6a1 accuracy gen 27
ChemBench_Yield_Prediction d4e6a1 accuracy gen 27
ChemBench_Temperature_Prediction d4e6a1 accuracy gen 26.73
ChemBench_Solvent_Prediction d4e6a1 accuracy gen 32.67
'''
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Inference Stage ----------------------------------------
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='Qwen2.5-1.5B-Instruct',
path='Qwen/Qwen2.5-1.5B-Instruct',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(do_sample=True, ),
max_out_len=200,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
# Gather every imported *_datasets list into one flat list.
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
api_meta_template = dict(round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
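# The meta_template maps OpenCompass's internal roles (SYSTEM/HUMAN/BOT) onto
# the judge API's chat roles; `generate=True` marks the turn the judge model
# produces.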
judge_models = [
dict(
# GPT4o
abbr='gpt-4o-0513-global',
type=OpenAI,
# gpt-4o
path='gpt-4o-0513-global',
key='xxx',  # your OpenAI API key; 'ENV' reads OPENAI_API_KEY from the environment
meta_template=api_meta_template,
query_per_second=16,
max_out_len=1000,
batch_size=8,
retry=3)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
work_dir = 'outputs/chinese_simpleqa/'
from copy import deepcopy
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import HuggingFaceCausalLM
from opencompass.models.lagent import CodeAgent, LagentAgent
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# Note that CUDA OOM errors may occur for HF models
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.summarizers.cibench import summarizer
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please respond with the following format:
```
{thought} Think about what you need to solve. Do you need to use tools?
{action} The tool name; it should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you a response after your response, using the following format:
```
{response} The results after calling the tool.
```
Therefore, DO NOT generate tool responses by yourself.
Also, please follow these guidelines:
1. Always use the code interpreter to solve the problem.
2. The generated code should always be in a markdown code block.
3. The generated code will be executed in an IPython manner and the results will be cached.
4. Your code should always be simple and should only solve the problem in the current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in the same manner as a Jupyter notebook. The code must be valid Python code.'''
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = './outputs/cibench/'
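# The ReAct protocol needs a system-style round to carry tool feedback, so the
# loop below patches any chat meta_template that lacks a SYSTEM entry: plain
# templates get a literal 'System response:' prefix, API templates get
# api_role='SYSTEM'.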
_agent_models = []
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM'
for r in round): # no system round
if not any('api_role' in r for r in round):
m['meta_template']['round'].append(
dict(role='system', begin='System response:', end='\n'))
else:
m['meta_template']['round'].append(
dict(role='system', api_role='SYSTEM'))
print(
f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}'
)
_agent_models.append(m)
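# The loop below rebuilds each patched chat model as a CIReAct code agent. The
# plain-LM generation fields (batch_size, max_out_len, max_seq_len) are dropped
# because the agent drives generation itself, turn by turn, up to `max_turn`.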
models = []
for m in _agent_models:
m = deepcopy(m)
origin_abbr = m.pop('abbr')
abbr = origin_abbr
m.pop('batch_size', None)
m.pop('max_out_len', None)
m.pop('max_seq_len', None)
run_cfg = m.pop('run_cfg', {})
agent_model = dict(
abbr=abbr,
summarizer_abbr=origin_abbr,
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=m,
actions=[
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
],
protocol=protocol,
batch_size=1,
run_cfg=run_cfg,
)
models.append(agent_model)
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
from opencompass.configs.summarizers.cibench import summarizer
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please respond with the following format:
```
{thought} Think about what you need to solve. Do you need to use tools?
{action} The tool name; it should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you a response after your response, using the following format:
```
{response} The results after calling the tool.
```
Therefore, DO NOT generate tool responses by yourself.
Also, please follow these guidelines:
1. Always use the code interpreter to solve the problem.
2. The generated code should always be in a markdown code block.
3. The generated code will be executed in an IPython manner and the results will be cached.
4. Your code should always be simple and should only solve the problem in the current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in the same manner as a Jupyter notebook. The code must be valid Python code.'''
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
dict(role='SYSTEM', api_role='SYSTEM'),
], )
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = 'outputs/cibench/'
models = [
dict(
abbr='gpt-4o',
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-4o',
rpm_verbose=True,
retry=99,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=2048,
temperature=0,
),
actions=actions,
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.datasets.circular import (
CircularARCDataset, CircularCEvalDataset, CircularCMMLUDataset,
CircularCSQADataset, CircularEvaluator, CircularHSWAGDataset,
CircularMMLUDataset, CircularOBQADataset, CircularRaceDataset)
from opencompass.summarizers import CircularSummarizer
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import \
ARC_c_datasets
from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import \
ARC_e_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \
cmmlu_datasets
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import \
commonsenseqa_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \
models as hf_internlm_chat_7b_model
from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \
models as hf_internlm_chat_20b_model
from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
models as hf_qwen_7b_chat_model
from opencompass.configs.models.qwen.hf_qwen_14b_chat import \
models as hf_qwen_14b_chat_model
from opencompass.configs.summarizers.groups.ceval import \
ceval_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
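# CircularEval: each multiple-choice question is asked once per cyclic rotation
# of its answer options (4 rotations for 4-option questions). `acc_origin`
# scores only the original ordering, while `perf_circular` credits a question
# only if every rotation is answered correctly, penalizing option-position bias.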
for ds, t in [
(ceval_datasets, CircularCEvalDataset),
(mmlu_datasets, CircularMMLUDataset),
(cmmlu_datasets, CircularCMMLUDataset),
(hellaswag_datasets, CircularHSWAGDataset),
(ARC_e_datasets, CircularARCDataset),
(ARC_c_datasets, CircularARCDataset),
(commonsenseqa_datasets, CircularCSQADataset),
(obqa_datasets, CircularOBQADataset),
(race_datasets, CircularRaceDataset),
]:
for d in ds:
d['type'] = t
d['abbr'] = d['abbr'] + '-circular-4'
d['eval_cfg']['evaluator'] = {
'type': CircularEvaluator,
'circular_pattern': 'circular'
}
d['circular_patterns'] = 'circular'
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# config summarizer
other_summary_groups = [
{
'name':
'average',
'subsets': [
'ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c',
'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high'
]
},
]
origin_summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
new_summary_groups = []
for item in origin_summary_groups:
new_summary_groups.append({
'name':
item['name'] + '-circular-4',
'subsets': [i + '-circular-4' for i in item['subsets']],
})
summarizer = dict(
type=CircularSummarizer,
metric_types=['acc_origin', 'perf_circular'],
dataset_abbrs=[
'average-circular-4',
'ceval-circular-4',
'mmlu-circular-4',
'cmmlu-circular-4',
'hellaswag-circular-4',
'ARC-e-circular-4',
'ARC-c-circular-4',
'commonsense_qa-circular-4',
'openbookqa_fact-circular-4',
'race-middle-circular-4',
'race-high-circular-4',
'ceval-humanities-circular-4',
'ceval-stem-circular-4',
'ceval-social-science-circular-4',
'ceval-other-circular-4',
'mmlu-humanities-circular-4',
'mmlu-stem-circular-4',
'mmlu-social-science-circular-4',
'mmlu-other-circular-4',
'cmmlu-humanities-circular-4',
'cmmlu-stem-circular-4',
'cmmlu-social-science-circular-4',
'cmmlu-other-circular-4',
'cmmlu-china-specific-circular-4',
],
summary_groups=new_summary_groups,
)
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.models.claude.claude import models
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with `num_return_sequences`,
# i.e. for models that can generate multiple responses for a single input.
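# For reference, pass@k is normally computed with the unbiased estimator from
# the HumanEval paper; a minimal sketch (not used by this config directly):
#
#     import math
#
#     def pass_at_k(n: int, c: int, k: int) -> float:
#         """n samples drawn per problem, c of them correct."""
#         if n - c < k:
#             return 1.0  # every size-k draw contains a correct sample
#         return 1.0 - math.comb(n - c, k) / math.comb(n, k)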
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
models = [
dict(
type=HuggingFaceCausalLM,
abbr='CodeLlama-7b-Python',
path='codellama/CodeLlama-7b-Python-hf',
tokenizer_path='codellama/CodeLlama-7b-Python-hf',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
num_return_sequences=10,
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=300),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with dataset repetition,
# i.e. for models that cannot generate multiple responses for a single input.
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
models = [
dict(
abbr='internlm-chat-7b-hf-v11',
type=HuggingFaceCausalLM,
path='internlm/internlm-chat-7b-v1_1',
tokenizer_path='internlm/internlm-chat-7b-v1_1',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_seq_len=2048,
meta_template=_meta_template,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
batch_size=8,
)
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=600),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_57b0b1 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_943d32 import math_datasets
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
models = [
dict(abbr='gpt-3.5-react',
type=CodeAgent,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
batch_size=8),
dict(abbr='WizardCoder-Python-13B-V1.0-react',
type=CodeAgent,
llm=dict(
type=HuggingFaceCausalLM,
path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_seq_len=2048,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
),
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1)),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=40000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.humanevalx.humanevalx_gen import \
humanevalx_datasets
from opencompass.configs.models.codegeex2.hf_codegeex2_6b import models
datasets = humanevalx_datasets
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_judge import compassarena_subjectivebench_singleturn_datasets
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_judge import compassarena_subjectivebench_multiturn_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as lmdeploy_internlm2_5_7b_chat
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import models as lmdeploy_internlm2_5_20b_chat
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import models as lmdeploy_llama3_1_8b_instruct
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import models as lmdeploy_llama3_1_70b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import models as lmdeploy_qwen2_5_0_5b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import models as lmdeploy_qwen2_5_1_5b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import models as lmdeploy_qwen2_5_3b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import models as lmdeploy_qwen2_5_14b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import models as lmdeploy_qwen2_5_32b_instruct
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import models as lmdeploy_qwen2_7b_instruct
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we usually enable sampling (do_sample=True) for models
# models = [
# dict(
# type=TurboMindModelwithChatTemplate,
# abbr='CompassJudger-1-7B-Instruct',
# path='opencompass/CompassJudger-1-7B-Instruct',
# engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
# gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
# max_seq_len=16384,
# max_out_len=2048,
# batch_size=16,
# run_cfg=dict(num_gpus=1),
# )
# ]
models = [
*lmdeploy_qwen2_5_14b_instruct, *lmdeploy_qwen2_5_32b_instruct,
*lmdeploy_qwen2_5_7b_instruct, *lmdeploy_qwen2_7b_instruct
]
datasets = [
*compassarena_subjectivebench_singleturn_datasets,
*compassarena_subjectivebench_multiturn_datasets
] # add datasets you want
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=DefaultSubjectiveSummarizer, )
work_dir = 'outputs/subjective/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.singleturn.pairwise_bt_judge import (
compassarena_subjectivebench_bradleyterry_singleturn_datasets, )
from opencompass.configs.datasets.subjective.compass_arena_subjective_bench.multiturn.pairwise_bt_judge import (
compassarena_subjectivebench_bradleyterry_multiturn_datasets, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat, )
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
models as lmdeploy_internlm2_5_20b_chat, )
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
models as lmdeploy_llama3_1_8b_instruct, )
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import (
models as lmdeploy_llama3_1_70b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import (
models as lmdeploy_qwen2_5_0_5b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b_instruct import (
models as lmdeploy_qwen2_5_1_5b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import (
models as lmdeploy_qwen2_5_3b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
models as lmdeploy_qwen2_5_14b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
models as lmdeploy_qwen2_5_32b_instruct, )
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import (
models as lmdeploy_qwen2_5_72b_instruct, )
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
models as lmdeploy_qwen2_7b_instruct, )
from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
HuggingFaceChatGLM3, OpenAI,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_num_worker import \
SubjectiveNumWorkerPartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import CompassArenaBradleyTerrySummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
# -------------Inference Stage ----------------------------------------
models = [
*lmdeploy_qwen2_5_14b_instruct,
*lmdeploy_qwen2_5_32b_instruct,
*lmdeploy_qwen2_5_7b_instruct,
*lmdeploy_qwen2_7b_instruct,
]
datasets = [
*compassarena_subjectivebench_bradleyterry_singleturn_datasets,
*compassarena_subjectivebench_bradleyterry_multiturn_datasets,
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='CompassJudger-1-32B-Instruct',
path='opencompass/CompassJudger-1-32B-Instruct',
engine_config=dict(session_len=16384, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=2048),
max_seq_len=16384,
max_out_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=4),
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
## ------------- Summary Configuration
# This step fits a Bradley-Terry model (statistical model) with an option
# to include style features and control variables based on groups
# (group variables must be available in the input dataset for each observation).
summarizer = dict(
type=CompassArenaBradleyTerrySummarizer,
rating_system='bradleyterry',
report_pred_win_rates=True,
num_bootstrap=100,
num_cpu=None,
with_control_vars=True,
normalize_style_features=False,
odds_ratio=True,
groups=['difficulty', 'category'],
)
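# Bradley-Terry in brief: each model m gets a latent strength beta_m with
#   P(i beats j) = exp(beta_i) / (exp(beta_i) + exp(beta_j)),
# which is a logistic regression on the indicator difference of the two models;
# the style features and group variables above enter as extra covariates.
# A toy sketch with hypothetical data (not part of this config):
#
#     import numpy as np
#     from sklearn.linear_model import LogisticRegression
#
#     # columns = models (A, B); each row is one battle, x = 1[first] - 1[second]
#     X = np.array([[1.0, -1.0], [1.0, -1.0], [-1.0, 1.0]])
#     y = np.array([1, 1, 0])  # 1 = the first model of the pair won
#     beta = LogisticRegression(fit_intercept=False).fit(X, y).coef_[0]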
work_dir = 'outputs/compassarena_subjectivebench_bradleyterry/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_clean_ppl import \
ARC_c_datasets
from opencompass.configs.datasets.ceval.ceval_clean_ppl import \
ceval_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_clean_ppl import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
from opencompass.configs.models.hf_llama.hf_llama2_7b import \
models as hf_llama2_7b_model
from opencompass.configs.models.qwen.hf_qwen_7b import \
models as hf_qwen_7b_model
from opencompass.configs.models.yi.hf_yi_6b import models as hf_yi_6b_model
from opencompass.configs.summarizers.contamination import summarizer
datasets = [
*ceval_datasets, *mmlu_datasets, *hellaswag_datasets, *ARC_c_datasets
]
models = [*hf_yi_6b_model, *hf_qwen_7b_model, *hf_llama2_7b_model]
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import bbh_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
cmmlu_datasets
from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_2c9cd6 import \
gpqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
hellaswag_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.deprecated_humaneval_gen_d2537e import \
humaneval_datasets
# ## Math
from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
mathbench_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
mmlu_pro_datasets
# Model List
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
models as lmdeploy_qwen2_5_1_5b_model
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
mathbench_2024_summary_groups
# TODO: Add LiveCodeBench
# ## Instruction Following
# from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name':
'core_average',
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'], ['bbh', 'naive_average'],
['hellaswag', 'accuracy'], ['drop', 'accuracy'],
['math', 'accuracy'], ['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score']],
},
]
summarizer = dict(
dataset_abbrs=[
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'naive_average'],
['hellaswag', 'accuracy'],
['drop', 'accuracy'],
['math', 'accuracy'],
['gsm8k', 'accuracy'],
['mathbench-t (average)', 'naive_average'],
['GPQA_diamond', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['IFEval', 'Prompt-level-strict-accuracy'],
['sanitized_mbpp', 'score'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math', 'accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science', 'accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'base_objective')
import os.path as osp
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
## Core Set
# ## Examination
# ## Reasoning
from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
cmmlu_datasets
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
drop_datasets
# ## Scientific
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
gsm8k_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
hellaswag_datasets
# ## Coding
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
humaneval_datasets
# TODO: Add LiveCodeBench
# ## Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
ifeval_datasets
# ## Math
from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
mathbench_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
sanitized_mbpp_datasets
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
mmlu_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
# Summarizer
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
# Model List
# from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
# from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
# from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
# with read_base():
core_summary_groups = [
{
'name':
'core_average',
'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'], ['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'], ['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'], ['hellaswag', 'accuracy'],
['mathbench-t (average)', 'naive_average']],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
['mmlu', 'accuracy'],
['mmlu_pro', 'accuracy'],
['cmmlu', 'accuracy'],
['bbh', 'score'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['GPQA_diamond', 'accuracy'],
['IFEval', 'Prompt-level-strict-accuracy'],
['drop', 'accuracy'],
['sanitized_mbpp', 'score'],
['gsm8k', 'accuracy'],
['hellaswag', 'accuracy'],
'mathbench-a (average)',
'mathbench-t (average)',
'',
['mmlu', 'accuracy'],
['mmlu-stem', 'accuracy'],
['mmlu-social-science', 'accuracy'],
['mmlu-humanities', 'accuracy'],
['mmlu-other', 'accuracy'],
'',
['mmlu_pro', 'accuracy'],
['mmlu_pro_math', 'accuracy'],
['mmlu_pro_physics', 'accuracy'],
['mmlu_pro_chemistry', 'accuracy'],
['mmlu_pro_law', 'accuracy'],
['mmlu_pro_engineering', 'accuracy'],
['mmlu_pro_other', 'accuracy'],
['mmlu_pro_economics', 'accuracy'],
['mmlu_pro_health', 'accuracy'],
['mmlu_pro_psychology', 'accuracy'],
['mmlu_pro_business', 'accuracy'],
['mmlu_pro_biology', 'accuracy'],
['mmlu_pro_philosophy', 'accuracy'],
['mmlu_pro_computer_science', 'accuracy'],
['mmlu_pro_history', 'accuracy'],
'',
['cmmlu', 'accuracy'],
['cmmlu-stem', 'accuracy'],
['cmmlu-social-science', 'accuracy'],
['cmmlu-humanities', 'accuracy'],
['cmmlu-other', 'accuracy'],
['cmmlu-china-specific', 'accuracy'],
'',
['bbh', 'extract_rate'],
['math', 'extract_rate'],
# ['openai_humaneval', 'extract_rate'],
['GPQA_diamond', 'extract_rate'],
# ['IFEval', 'extract_rate'],
'',
['mmlu', 'extract_rate'],
['mmlu-stem', 'extract_rate'],
['mmlu-social-science', 'extract_rate'],
['mmlu-humanities', 'extract_rate'],
['mmlu-other', 'extract_rate'],
'',
['mmlu_pro', 'extract_rate'],
['mmlu_pro_math', 'extract_rate'],
['mmlu_pro_physics', 'extract_rate'],
['mmlu_pro_chemistry', 'extract_rate'],
['mmlu_pro_law', 'extract_rate'],
['mmlu_pro_engineering', 'extract_rate'],
['mmlu_pro_other', 'extract_rate'],
['mmlu_pro_economics', 'extract_rate'],
['mmlu_pro_health', 'extract_rate'],
['mmlu_pro_psychology', 'extract_rate'],
['mmlu_pro_business', 'extract_rate'],
['mmlu_pro_biology', 'extract_rate'],
['mmlu_pro_philosophy', 'extract_rate'],
['mmlu_pro_computer_science', 'extract_rate'],
['mmlu_pro_history', 'extract_rate'],
'',
['cmmlu', 'extract_rate'],
['cmmlu-stem', 'extract_rate'],
['cmmlu-social-science', 'extract_rate'],
['cmmlu-humanities', 'extract_rate'],
['cmmlu-other', 'extract_rate'],
['cmmlu-china-specific', 'extract_rate'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench_2409_objective/'
work_dir = osp.join(base_exp_dir, 'chat_objective')
import os.path as osp
from copy import deepcopy
from mmengine.config import read_base
from opencompass.models import (HuggingFacewithChatTemplate,
TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
from opencompass.configs.datasets.longbench.longbench import \
longbench_datasets
from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import \
needlebench_datasets as needlebench_8k_datasets
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
needlebench_datasets as needlebench_32k_datasets
from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
needlebench_datasets as needlebench_128k_datasets
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
ruler_datasets as ruler_8k_datasets
from opencompass.configs.datasets.ruler.ruler_32k_gen import \
ruler_datasets as ruler_32k_datasets
from opencompass.configs.datasets.ruler.ruler_128k_gen import \
ruler_datasets as ruler_128k_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
models as lmdeploy_internlm2_5_7b_1m_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as llama3_1_8b_instruct_model
# Instruct models
from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
models as lmdeploy_qwen2_7b_instruct_model
# Summary Groups
from opencompass.configs.summarizers.groups.longbench import \
longbench_summary_groups
from opencompass.configs.summarizers.groups.ruler import \
ruler_summary_groups
from opencompass.configs.summarizers.needlebench import (
needlebench_8k_summarizer, needlebench_32k_summarizer,
needlebench_128k_summarizer)
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups']
needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups']
needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups']
# Instruct models summarizer
summarizer = dict(
dataset_abbrs=[
['ruler_8k', 'naive_average'],
['ruler_32k', 'naive_average'],
['ruler_128k', 'naive_average'],
['NeedleBench-Overall-Score-8K', 'weighted_average'],
['NeedleBench-Overall-Score-32K', 'weighted_average'],
['NeedleBench-Overall-Score-128K', 'weighted_average'],
['longbench', 'naive_average'],
['longbench_zh', 'naive_average'],
['longbench_en', 'naive_average'],
'',
'longbench_single-document-qa',
'longbench_multi-document-qa',
'longbench_summarization',
'longbench_few-shot-learning',
'longbench_synthetic-tasks',
'longbench_code-completion',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
lmdeploy_qwen2_7b_instruct_model[0]['max_seq_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['session_len'] = 1048576
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['tp'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
lmdeploy_qwen2_7b_instruct_model[0]['run_cfg']['num_gpus'] = 4
llama3_1_8b_instruct_model[0]['max_seq_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['session_len'] = 1048576
llama3_1_8b_instruct_model[0]['engine_config']['tp'] = 4
llama3_1_8b_instruct_model[0]['engine_config']['rope_scaling_factor'] = 4
llama3_1_8b_instruct_model[0]['run_cfg']['num_gpus'] = 4
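# 1048576 = 1024 * 1024 tokens, i.e. a 1M-token window. rope_scaling_factor
# stretches the RoPE position embeddings so the engine can attend far beyond
# the original training context, and tp=4 / num_gpus=4 shard the
# correspondingly larger KV cache across four GPUs.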
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask)),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
base_exp_dir = 'outputs/corebench/'
work_dir = osp.join(base_exp_dir, 'long_context')