Commit c289ecc0 authored by xinghao

Initial commit
from mmengine.config import read_base
with read_base():
# Models
# Datasets
from opencompass.configs.datasets.babilong.babilong_0k_gen import \
babiLong_0k_datasets
from opencompass.configs.datasets.babilong.babilong_4k_gen import \
babiLong_4k_datasets
from opencompass.configs.datasets.babilong.babilong_16k_gen import \
babiLong_16k_datasets
from opencompass.configs.datasets.babilong.babilong_32k_gen import \
babiLong_32k_datasets
from opencompass.configs.datasets.babilong.babilong_128k_gen import \
babiLong_128k_datasets
from opencompass.configs.datasets.babilong.babilong_256k_gen import \
babiLong_256k_datasets
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as lmdeploy_internlm2_5_7b_chat_model
from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
models as lmdeploy_llama3_1_8b_instruct_model
from opencompass.configs.models.mistral.lmdeploy_ministral_8b_instruct_2410 import \
models as lmdeploy_ministral_8b_instruct_2410_model
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
models as lmdeploy_qwen2_5_7b_instruct_model
from opencompass.configs.summarizers.groups.babilong import \
babilong_summary_groups
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for model in models:
model['engine_config']['session_len'] = 1024 * 1024
model['max_seq_len'] = 1024 * 1024
model['engine_config']['tp'] = 4
model['run_cfg']['num_gpus'] = 4
summarizer = dict(
dataset_abbrs=[
'babilong_0k',
'babilong_4k',
'babilong_16k',
'babilong_32k',
'babilong_128k',
'babilong_256k',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
work_dir = './outputs/babilong'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_base_gen import \
gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_base_gen import \
math_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_1_8b import \
models as hf_internlm2_1_8b_models
from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
models as hf_qwen2_1_5b_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_models + hf_internlm2_1_8b_models
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets
from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets
from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import (
gpqa_datasets,
)
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import (
mmlu_pro_datasets,
)
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
ifeval_datasets,
)
from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import (
smolinstruct_datasets_0shot_instruct as smolinstruct_datasets,
)
from opencompass.configs.datasets.ChemBench.ChemBench_llmjudge_gen_c584cf import (
chembench_datasets,
)
from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import (
matbench_datasets,
)
from opencompass.configs.datasets.ProteinLMBench.ProteinLMBench_llmjudge_gen_a67965 import (
proteinlmbench_datasets,
)
# Summary Groups
from opencompass.configs.summarizers.groups.mmlu_pro import (
mmlu_pro_summary_groups,
)
# Models
from opencompass.configs.models.interns1.intern_s1 import \
models as interns1_model
#######################################################################
# PART 1 Datasets List #
#######################################################################
# Collect all imported *_datasets for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
[])
# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict()
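# A minimal, commented-out sketch of what judge_cfg could hold: in OpenCompass the
# judge is just another model dict, mirroring the OpenAI-based judge_models used in
# the CHARM and chinese_simpleqa configs later in this commit. The abbr/path/key
# values below are illustrative placeholders, not the settings actually used here.
# from opencompass.models import OpenAI
# judge_cfg = dict(
#     abbr='gpt-4o-judge',
#     type=OpenAI,
#     path='gpt-4o',
#     key='ENV',              # read the API key from the environment
#     query_per_second=4,
#     max_out_len=2048,
#     batch_size=8,
#     temperature=0,
# )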
for item in datasets:
item['infer_cfg']['inferencer']['max_out_len'] = 65536
if 'judge_cfg' in item['eval_cfg']['evaluator']:
item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys() and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.extend(
[
{
'name': 'ChemBench',
'subsets': [
'ChemBench_Name_Conversion',
'ChemBench_Property_Prediction',
'ChemBench_Mol2caption',
'ChemBench_Caption2mol',
'ChemBench_Product_Prediction',
'ChemBench_Retrosynthesis',
'ChemBench_Yield_Prediction',
'ChemBench_Temperature_Prediction',
],
},
]
)
summarizer = dict(
dataset_abbrs=[
'Knowledge',
['mmlu_pro', 'accuracy'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['GPQA_diamond', 'accuracy'],
'',
'Math Calculation',
['aime2025', 'accuracy'],
'',
'Academic',
['ChemBench', 'naive_average'],
['ProteinLMBench', 'accuracy'],
'',
'SmolInstruct',
['NC-I2F-0shot-instruct', 'score'],
['NC-I2S-0shot-instruct', 'score'],
['NC-S2F-0shot-instruct', 'score'],
['NC-S2I-0shot-instruct', 'score'],
['PP-ESOL-0shot-instruct', 'score'],
['PP-Lipo-0shot-instruct', 'score'],
['PP-BBBP-0shot-instruct', 'accuracy'],
['PP-ClinTox-0shot-instruct', 'accuracy'],
['PP-HIV-0shot-instruct', 'accuracy'],
['PP-SIDER-0shot-instruct', 'accuracy'],
['MC-0shot-instruct', 'score'],
['MG-0shot-instruct', 'score'],
['FS-0shot-instruct', 'score'],
['RS-0shot-instruct', 'score'],
'',
['matbench_expt_gap', 'mae'],
['matbench_steels', 'mae'],
['matbench_expt_is_metal', 'accuracy'],
['matbench_glass', 'accuracy'],
'',
],
summary_groups=summary_groups,
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# infer with local runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask),
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
work_dir = './outputs/oc_bench_intern_s1'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.lveval.lveval import \
LVEval_datasets as datasets
from opencompass.configs.models.bluelm.hf_bluelm_7b_chat_32k import models
from opencompass.configs.summarizers.lveval import summarizer
models[0]['path'] = '/path/to/your/huggingface_models/BlueLM-7B-Chat-32K'
models[0][
'tokenizer_path'] = '/path/to/your/huggingface_models/BlueLM-7B-Chat-32K'
models[0]['max_seq_len'] = 32768
models[0]['generation_kwargs'] = dict(do_sample=False)
models[0]['mode'] = 'mid' # truncate in the middle
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import (
GenericLLMEvaluator,
CascadeEvaluator,
MATHVerifyEvaluator,
)
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import (
MATHDataset,
math_postprocess_v2,
normalize_final_answer,
)
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Models
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
models as lmdeploy_qwen2_5_7b_instruct_model,
)
reader_cfg = dict(input_columns=['problem'], output_column='solution')
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
########################## Evaluator #################################
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
llm_judge_evaluator = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
),
judge_cfg=dict(),
)
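# Assumed behaviour: with parallel=False the CascadeEvaluator applies the rule-based
# verifier first and only falls back to the LLM judge for predictions the rules reject.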
rule_evaluator = dict(type=MATHVerifyEvaluator)
cascade_evaluator = dict(
    type=CascadeEvaluator,
    llm_evaluator=llm_judge_evaluator,
    rule_evaluator=rule_evaluator,
    parallel=False,
)
########################## Dataset #################################
eval_cfg = dict()
# eval_cfg['evaluator'] = rule_evaluator
# eval_cfg['evaluator'] = llm_judge_evaluator
eval_cfg['evaluator'] = cascade_evaluator
math_datasets = [
dict(
abbr='math_prm800k_500',
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
]
datasets = math_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'math_prm800k_500_cascade_evaluator'
from mmengine.config import read_base
from opencompass.models import OpenAI
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import CharmMemSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
with read_base():
from opencompass.configs.datasets.CHARM.charm_memory_gen_bbbd53 import \
charm_memory_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
## ------------- JudgeLLM Configuration
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
abbr='GPT-3.5-turbo-0125',
type=OpenAI,
path='gpt-3.5-turbo-0125',
key='ENV',
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
max_task_size=1000,
mode='singlescore',
models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner,
max_num_workers=2,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=CharmMemSummarizer)
work_dir = './outputs/CHARM_mem/chat/'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.CHARM.charm_reason_gen_f8fca2 import \
charm_reason_datasets as datasets
# ------>>>>>> https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
# from opencompass.configs.models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model
# from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model
# from opencompass.configs.models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model
# from opencompass.configs.models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model
# from opencompass.configs.models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model
# from opencompass.configs.models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
# from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1
# from opencompass.configs.models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_34b_chat import models as yi_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model
# <<<<<<------ https://arxiv.org/abs/2403.14112
# from opencompass.configs.models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model
# from opencompass.configs.models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model
# from opencompass.configs.models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model
# from opencompass.configs.models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model
# from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model
# from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model
# from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model
# from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model
# from opencompass.configs.models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model
# from opencompass.configs.models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model
from .summarizers.charm_reason import summarizer
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
work_dir = './outputs/CHARM_rea/chat/'
# dataset version metric mode internlm2-chat-7b-turbomind
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
# charm-reason-Direct - naive_average gen 49.51
# charm-reason-ZH-CoT - naive_average gen 61.33
# charm-reason-EN-CoT - naive_average gen 54.55
# charm-reason-XLT - naive_average gen 58.46
# charm-reason-Translate-EN - naive_average gen 56.15
# - - - -
# charm-reason-Chinese_Direct - naive_average gen 47.14
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
# charm-reason-Chinese_XLT - naive_average gen 53.57
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
# charm-reason-Global_Direct - naive_average gen 51.88
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
# charm-reason-Global_EN-CoT - naive_average gen 60.79
# charm-reason-Global_XLT - naive_average gen 63.36
# charm-reason-Global_Translate-EN - naive_average gen 64.10
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.models.lagent import LagentAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_agent_gen_be1606 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_agent_gen_af2293 import \
math_datasets
from opencompass.configs.datasets.MathBench.mathbench_agent_gen_568903 import \
mathbench_agent_datasets
from opencompass.configs.summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```
def solution():
variable_names_with_real_meaning = func(variable)
return variable_names_with_real_meaning
```"""
protocol = dict(
type=ReActProtocol,
action=dict(role='ACTION', begin='Tool:', end='\n'),
action_input=dict(role='ARGS', begin='Tool Input:', end='\n'),
finish=dict(role='FINISH', begin='FinalAnswer:', end='\n'),
call_protocol=system_prompt,
)
models = [
dict(
abbr='gpt-3.5-react',
type=LagentAgent,
agent_type=ReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=PythonInterpreter),
],
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_d6de81 import \
gsm8k_datasets
from opencompass.configs.datasets.math.math_gen_1ed9c2 import math_datasets
from opencompass.configs.datasets.MathBench.mathbench_gen import \
mathbench_datasets
from opencompass.configs.summarizers.math_baseline import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_datasets
models = [
dict(
abbr='gpt-3.5-react',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.demo.demo_gsm8k_chat_gen import \
gsm8k_datasets
from opencompass.configs.datasets.demo.demo_math_chat_gen import \
math_datasets
from opencompass.configs.models.hf_internlm.hf_internlm2_chat_1_8b import \
models as hf_internlm2_chat_1_8b_models
from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
models as hf_qwen2_1_5b_instruct_models
datasets = gsm8k_datasets + math_datasets
models = hf_qwen2_1_5b_instruct_models + hf_internlm2_chat_1_8b_models
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.openicl import ChatInferencer
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
gsm8k_datasets as datasets
models = [
dict(
abbr='gpt-3.5',
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
for dataset in datasets:
# Use ChatInferencer instead of GenInferencer
dataset['infer_cfg']['inferencer'] = dict(type=ChatInferencer)
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Models (add your models here)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model
# Datasets
from opencompass.configs.chatml_datasets.MaScQA.MaScQA_gen import datasets as MaScQA_chatml
from opencompass.configs.chatml_datasets.CPsyExam.CPsyExam_gen import datasets as CPsyExam_chatml
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
chatml_datasets = sum(
(v for k, v in locals().items() if k.endswith('_chatml')),
[],
)
# Your Judge Model Configs Here
judge_cfg = dict()
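# Commented-out sketch of a possible judge_cfg (an ordinary OpenCompass model dict,
# like the judge_models in the subjective-eval configs elsewhere in this commit);
# all values below are placeholders.
# judge_cfg = dict(
#     abbr='gpt-4o-judge',
#     type=OpenAI,            # from opencompass.models (not imported in this file)
#     path='gpt-4o',
#     key='ENV',
#     query_per_second=4,
#     max_out_len=2048,
#     batch_size=8,
# )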
for dataset in chatml_datasets:
if dataset['evaluator']['type'] == 'llm_evaluator':
dataset['evaluator']['judge_cfg'] = judge_cfg
if dataset['evaluator']['type'] == 'cascade_evaluator':
dataset['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
eval = dict(
partitioner=dict(type=NaivePartitioner, n=8),
runner=dict(
type=LocalRunner, task=dict(type=OpenICLEvalTask), max_num_workers=32
),
)
work_dir = 'outputs/ChatML_Datasets'
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.ChemBench.ChemBench_gen import \
chembench_datasets
from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
models
datasets = [*chembench_datasets]
models = [*models]
'''
dataset version metric mode mistral-7b-instruct-v0.2-hf
-------------------------------- --------- -------- ------ -----------------------------
ChemBench_Name_Conversion d4e6a1 accuracy gen 45.43
ChemBench_Property_Prediction d4e6a1 accuracy gen 47.11
ChemBench_Mol2caption d4e6a1 accuracy gen 64.21
ChemBench_Caption2mol d4e6a1 accuracy gen 35.38
ChemBench_Product_Prediction d4e6a1 accuracy gen 38.67
ChemBench_Retrosynthesis d4e6a1 accuracy gen 27
ChemBench_Yield_Prediction d4e6a1 accuracy gen 27
ChemBench_Temperature_Prediction d4e6a1 accuracy gen 26.73
ChemBench_Solvent_Prediction d4e6a1 accuracy gen 32.67
'''
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import csimpleqa_datasets
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
# -------------Inference Stage ----------------------------------------
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='Qwen2.5-1.5B-Instruct',
path='Qwen/Qwen2.5-1.5B-Instruct',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(do_sample=True, ),
max_out_len=200,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
summarizer = dict(type=DefaultSubjectiveSummarizer)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
api_meta_template = dict(round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
judge_models = [
dict(
# GPT4o
abbr='gpt-4o-0513-global',
type=OpenAI,
# gpt-4o
path='gpt-4o-0513-global',
key='xxx', # provide OPENAI_API_KEY
meta_template=api_meta_template,
query_per_second=16,
max_out_len=1000,
batch_size=8,
retry=3)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner,
models=models,
judge_models=judge_models),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)),
)
work_dir = 'outputs/chinese_simpleqa/'
from copy import deepcopy
from lagent import ReAct
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import HuggingFaceCausalLM
from opencompass.models.lagent import CodeAgent, LagentAgent
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# Note: HF models may hit CUDA OOM errors here
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
models as lmdeploy_llama3_8b_instruct_model
from opencompass.configs.summarizers.cibench import summarizer
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
_origin_models = sum([v for k, v in locals().items() if k.endswith('_model')],
[])
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = './outputs/cibench/'
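# Ensure every base model's meta_template has a SYSTEM round (added below if missing),
# which the agent protocol uses for tool/system messages.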
_agent_models = []
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if all(r['role'].upper() != 'SYSTEM'
for r in round): # no system round
if not any('api_role' in r for r in round):
m['meta_template']['round'].append(
dict(role='system', begin='System response:', end='\n'))
else:
m['meta_template']['round'].append(
dict(role='system', api_role='SYSTEM'))
print(
f'WARNING: adding SYSTEM round in meta_template for {m.get("abbr", None)}'
)
_agent_models.append(m)
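# Wrap each patched base model into a CodeAgent driving CIReAct with the IPython
# interpreter action defined above.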
models = []
for m in _agent_models:
m = deepcopy(m)
origin_abbr = m.pop('abbr')
abbr = origin_abbr
m.pop('batch_size', None)
m.pop('max_out_len', None)
m.pop('max_seq_len', None)
run_cfg = m.pop('run_cfg', {})
agent_model = dict(
abbr=abbr,
summarizer_abbr=origin_abbr,
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=m,
actions=[
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
],
protocol=protocol,
batch_size=1,
run_cfg=run_cfg,
)
models.append(agent_model)
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from lagent.agents.react import ReActProtocol
from mmengine.config import read_base
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct
from opencompass.models import OpenAI
from opencompass.models.lagent import CodeAgent
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \
cibench_datasets as cibench_datasets_generation
from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \
cibench_datasets as cibench_datasets_template
# Oracle mode for analysis
# from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle
# from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle
from opencompass.configs.summarizers.cibench import summarizer
datasets = []
datasets += cibench_datasets_template
datasets += cibench_datasets_generation
# datasets += cibench_datasets_template_oracle
# datasets += cibench_datasets_generation_oracle
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.
Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
dict(role='SYSTEM', api_role='SYSTEM'),
], )
actions = [
dict(type=IPythonInterpreter,
user_data_dir='./data/cibench_dataset/datasources',
description=IPYTHON_INTERPRETER_DESCRIPTION)
]
protocol = dict(
type=ReActProtocol,
call_protocol=FEWSHOT_INSTRUCTION,
force_stop=FORCE_STOP_PROMPT_EN,
finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
)
work_dir = 'outputs/cibench/'
models = [
dict(
abbr='gpt-4o',
type=CodeAgent,
agent_type=CIReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-4o',
rpm_verbose=True,
retry=99,
meta_template=api_meta_template,
query_per_second=1,
max_seq_len=2048,
temperature=0,
),
actions=actions,
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLInferTask)),
)
from mmengine.config import read_base
from opencompass.datasets.circular import (
CircularARCDataset, CircularCEvalDataset, CircularCMMLUDataset,
CircularCSQADataset, CircularEvaluator, CircularHSWAGDataset,
CircularMMLUDataset, CircularOBQADataset, CircularRaceDataset)
from opencompass.summarizers import CircularSummarizer
with read_base():
from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import \
ARC_c_datasets
from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import \
ARC_e_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
ceval_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \
cmmlu_datasets
from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import \
commonsenseqa_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import \
hellaswag_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \
models as hf_internlm_chat_7b_model
from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \
models as hf_internlm_chat_20b_model
from opencompass.configs.models.qwen.hf_qwen_7b_chat import \
models as hf_qwen_7b_chat_model
from opencompass.configs.models.qwen.hf_qwen_14b_chat import \
models as hf_qwen_14b_chat_model
from opencompass.configs.summarizers.groups.ceval import \
ceval_summary_groups
from opencompass.configs.summarizers.groups.cmmlu import \
cmmlu_summary_groups
from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
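# Swap each dataset in for its circular variant (answer options rotated through all
# cyclic orders) and score it with CircularEvaluator.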
for ds, t in [
(ceval_datasets, CircularCEvalDataset),
(mmlu_datasets, CircularMMLUDataset),
(cmmlu_datasets, CircularCMMLUDataset),
(hellaswag_datasets, CircularHSWAGDataset),
(ARC_e_datasets, CircularARCDataset),
(ARC_c_datasets, CircularARCDataset),
(commonsenseqa_datasets, CircularCSQADataset),
(obqa_datasets, CircularOBQADataset),
(race_datasets, CircularRaceDataset),
]:
for d in ds:
d['type'] = t
d['abbr'] = d['abbr'] + '-circular-4'
d['eval_cfg']['evaluator'] = {
'type': CircularEvaluator,
'circular_pattern': 'circular'
}
d['circular_patterns'] = 'circular'
datasets = sum([
v
for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# config summarizer
other_summary_groups = [
{
'name':
'average',
'subsets': [
'ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c',
'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high'
]
},
]
origin_summary_groups = sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
new_summary_groups = []
for item in origin_summary_groups:
new_summary_groups.append({
'name':
item['name'] + '-circular-4',
'subsets': [i + '-circular-4' for i in item['subsets']],
})
summarizer = dict(
type=CircularSummarizer,
metric_types=['acc_origin', 'perf_circular'],
dataset_abbrs=[
'average-circular-4',
'ceval-circular-4',
'mmlu-circular-4',
'cmmlu-circular-4',
'hellaswag-circular-4',
'ARC-e-circular-4',
'ARC-c-circular-4',
'commonsense_qa-circular-4',
'openbookqa_fact-circular-4',
'race-middle-circular-4',
'race-high-circular-4',
'ceval-humanities-circular-4',
'ceval-stem-circular-4',
'ceval-social-science-circular-4',
'ceval-other-circular-4',
'mmlu-humanities-circular-4',
'mmlu-stem-circular-4',
'mmlu-social-science-circular-4',
'mmlu-other-circular-4',
'cmmlu-humanities-circular-4',
'cmmlu-stem-circular-4',
'cmmlu-social-science-circular-4',
'cmmlu-other-circular-4',
'cmmlu-china-specific-circular-4',
],
summary_groups=new_summary_groups,
)
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
# choose a list of datasets
from opencompass.configs.datasets.collections.chat_medium import datasets
from opencompass.configs.models.claude.claude import models
# and output the results in a chosen format
from opencompass.configs.summarizers.medium import summarizer
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner,
max_num_workers=8,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with `num_return_sequences`,
# i.e. for models that can generate multiple responses for a single input
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_passk_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
models = [
dict(
type=HuggingFaceCausalLM,
abbr='CodeLlama-7b-Python',
path='codellama/CodeLlama-7b-Python-hf',
tokenizer_path='codellama/CodeLlama-7b-Python-hf',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
num_return_sequences=10,
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=300),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)
# This config is used for pass@k evaluation with dataset repetition,
# i.e. for models that cannot generate multiple responses for a single input
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from opencompass.configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import \
humaneval_datasets
from opencompass.configs.datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import \
mbpp_datasets
from opencompass.configs.datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import \
sanitized_mbpp_datasets
datasets = []
datasets += humaneval_datasets
datasets += mbpp_datasets
datasets += sanitized_mbpp_datasets
_meta_template = dict(round=[
dict(role='HUMAN', begin='<|User|>:', end='\n'),
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
], )
models = [
dict(
abbr='internlm-chat-7b-hf-v11',
type=HuggingFaceCausalLM,
path='internlm/internlm-chat-7b-v1_1',
tokenizer_path='internlm/internlm-chat-7b-v1_1',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_seq_len=2048,
meta_template=_meta_template,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
generation_kwargs=dict(
do_sample=True,
top_p=0.95,
temperature=0.8,
),
run_cfg=dict(num_gpus=1, num_procs=1),
batch_size=8,
)
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=600),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)