from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
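# Gather every imported *_datasets list into a single flat list of dataset configs.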
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/mixtral-series-instruct/'
from opencompass.models import VLLMwithChatTemplate
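# Each settings entry is (abbr, HuggingFace model path, number of GPUs for tensor parallelism).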
settings = [
('mixtral-8x7b-instruct-v0.1-vllm', 'mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
('mixtral-8x22b-instruct-v0.1-vllm', 'mistralai/Mixtral-8x22B-Instruct-v0.1', 8),
    ('mistral-large-instruct-2407-vllm', 'mistralai/Mistral-Large-Instruct-2407', 8),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLMwithChatTemplate,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus, gpu_memory_utilization=0.9),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=256,
batch_size=16,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus),
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/mixtral-series/'
from opencompass.models import VLLM
settings = [
('mixtral-8x7b-v0.1-vllm', 'mistralai/Mixtral-8x7B-v0.1', 2),
('mixtral-8x22b-v0.1-vllm', 'mistralai/Mixtral-8x22B-v0.1', 8),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLM,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus, gpu_memory_utilization=0.9, dtype='float16'),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=1024,
batch_size=16,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus),
)
)
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import aime2024_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import LCB_datasets
from opencompass.configs.datasets.math.math_500_gen import math_datasets
from opencompass.configs.datasets.ceval.ceval_zero_shot_gen_bd40ef import ceval_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen import humaneval_datasets
from opencompass.configs.summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = '/workspace/logs/' #输出日志路径
from opencompass.models import OpenAISDK
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
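# The meta template maps OpenCompass dialogue roles onto the API's chat roles;
# generate=True marks the BOT turn as the one the model is asked to produce.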
models = [
dict(
        abbr='DeepSeek-R1-INT8',  # model name recorded in the output logs
type=OpenAISDK,
        path='/nvme/models/DeepSeek-R1-INT8/',  # model name set on the server; if not set, same as the model path on the server
        openai_api_base="http://0.0.0.0:8000/v1",  # API endpoint
        tokenizer_path="/nvme/models/DeepSeek-R1-INT8",  # model path, used to prepare the input prompts
key='EMPTY',
        meta_template=api_meta_template,  # used for chat evaluation; comment this out for base models
temperature=0,
query_per_second=64,
max_out_len=32768,
max_seq_len=32768,
pred_postprocessor=dict(
type='opencompass.utils.text_postprocessors.extract_non_reasoning_content'),
batch_size=32),
]
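# This config assumes an OpenAI-compatible server is already serving the model at
# the openai_api_base above. An illustrative vLLM launch (the flags shown are
# assumptions that depend on your hardware, not part of this config):
#   vllm serve /nvme/models/DeepSeek-R1-INT8 --port 8000 --tensor-parallel-size 8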
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import LCB_datasets
from opencompass.configs.datasets.math.math_500_gen import math_datasets
from opencompass.configs.datasets.ceval.ceval_zero_shot_gen_bd40ef import ceval_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_gen_cdbebf import mmlu_pro_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen import humaneval_datasets
from opencompass.configs.summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = '/workspace/logs/offline'  # output log path
from opencompass.models import VLLM
from opencompass.models import VLLMwithChatTemplate
settings = [ # abbr, path, tp, enforce_eager, data_type, max_len, batch_size
('Qwen3-32B', '/models/qwen3/Qwen3-32B', 2, False, 'bfloat16', 32768, 32),
    ('Qwen3-30B-A3B', '/models/qwen3/Qwen3-30B-A3B', 2, False, 'bfloat16', 32768, 32),
]
models = []
for abbr, path, tp, eager, data_type, max_len, batch_size in settings:
models.append(
dict(
            type=VLLMwithChatTemplate,  # use VLLMwithChatTemplate for chat evaluation and VLLM for base evaluation
abbr=abbr,
path=path,
            model_kwargs=dict(
                tensor_parallel_size=tp,
                dtype=data_type,
                max_model_len=max_len,
                enforce_eager=eager,
                gpu_memory_utilization=0.95,
                # for INT4 models, add quantization="awq" or quantization="gptq"
            ),
max_out_len=max_len,
max_seq_len=max_len,
batch_size=batch_size,
pred_postprocessor=dict(
type='opencompass.utils.text_postprocessors.extract_non_reasoning_content'),
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=tp, num_procs=1),
)
)
from opencompass.partitioners import NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8,  # how many tasks each model is split into; recommended not to exceed max_num_workers
        num_split=8,  # how many splits each dataset is divided into; if None, num_worker is used
        min_task_size=16,  # minimum number of data entries per split
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=8,  # max tasks run in parallel; recommended: GPU count / model tensor-parallel size
        task=dict(type=OpenICLInferTask),  # the task type to run
    )
)
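# An illustrative launch of this config (the file name is a placeholder):
#   python run.py configs/eval_qwen3_vllm.py --debug
# --debug runs tasks in a single process without redirecting output, which is
# useful for verifying the setup before a full run.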
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen1.5-series-chat/'
from opencompass.models import VLLMwithChatTemplate
settings = [
('qwen1.5-0.5b-chat-vllm', 'Qwen/Qwen1.5-0.5B-Chat', 1),
('qwen1.5-1.8b-chat-vllm', 'Qwen/Qwen1.5-1.8B-Chat', 1),
('qwen1.5-4b-chat-vllm', 'Qwen/Qwen1.5-4B-Chat', 1),
('qwen1.5-7b-chat-vllm', 'Qwen/Qwen1.5-7B-Chat', 1),
('qwen1.5-14b-chat-vllm', 'Qwen/Qwen1.5-14B-Chat', 1),
('qwen1.5-32b-chat-vllm', 'Qwen/Qwen1.5-32B-Chat', 1),
('qwen1.5-72b-chat-vllm', 'Qwen/Qwen1.5-72B-Chat', 4),
('qwen1.5-110b-chat-vllm', 'Qwen/Qwen1.5-110B-Chat', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLMwithChatTemplate,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=1024,
batch_size=16,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus),
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen1.5-series/'
from opencompass.models import VLLM
settings = [
('qwen1.5-0.5b-vllm', 'Qwen/Qwen1.5-0.5B', 1),
('qwen1.5-1.8b-vllm', 'Qwen/Qwen1.5-1.8B', 1),
('qwen1.5-4b-vllm', 'Qwen/Qwen1.5-4B', 1),
('qwen1.5-7b-vllm', 'Qwen/Qwen1.5-7B', 1),
('qwen1.5-14b-vllm', 'Qwen/Qwen1.5-14B', 1),
('qwen1.5-32b-vllm', 'Qwen/Qwen1.5-32B', 2),
('qwen1.5-72b-vllm', 'Qwen/Qwen1.5-72B', 4),
('qwen1.5-110b-vllm', 'Qwen/Qwen1.5-110B', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLM,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus, num_procs=1),
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen2.5-series/'
from opencompass.models import VLLMwithChatTemplate
settings = [
('qwen2.5-0.5b-instruct-vllm', 'Qwen/Qwen2.5-0.5B-Instruct', 1),
('qwen2.5-1.5b-instruct-vllm', 'Qwen/Qwen2.5-1.5B-Instruct', 1),
('qwen2.5-3b-instruct-vllm', 'Qwen/Qwen2.5-3B-Instruct', 1),
('qwen2.5-7b-instruct-vllm', 'Qwen/Qwen2.5-7B-Instruct', 1),
('qwen2.5-14b-instruct-vllm', 'Qwen/Qwen2.5-14B-Instruct', 2),
('qwen2.5-32b-instruct-vllm', 'Qwen/Qwen2.5-32B-Instruct', 2),
    ('qwen2.5-72b-instruct-vllm', 'Qwen/Qwen2.5-72B-Instruct', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLMwithChatTemplate,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus, gpu_memory_utilization=0.9),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=4096,
batch_size=16,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus, num_procs=1),
)
)
from mmengine.config import read_base
from opencompass.models import OpenAI
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen2.5-series/'
settings = [
('qwen2.5-0.5b-instruct-vllm', 'Qwen/Qwen2.5-0.5B-Instruct', 1),
('qwen2.5-1.5b-instruct-vllm', 'Qwen/Qwen2.5-1.5B-Instruct', 1),
('qwen2.5-3b-instruct-vllm', 'Qwen/Qwen2.5-3B-Instruct', 1),
('qwen2.5-7b-instruct-vllm', 'Qwen/Qwen2.5-7B-Instruct', 1),
('qwen2.5-14b-instruct-vllm', 'Qwen/Qwen2.5-14B-Instruct', 2),
('qwen2.5-32b-instruct-vllm', 'Qwen/Qwen2.5-32B-Instruct', 2),
    ('qwen2.5-72b-instruct-vllm', 'Qwen/Qwen2.5-72B-Instruct', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=OpenAI,
abbr=abbr,
path=path,
openai_api_base='http://0.0.0.0:8000/v1/chat/completions',
key='ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
# query_per_second=1,
max_out_len=4096,
max_seq_len=4096,
batch_size=16,
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen2-series-instruct/'
from opencompass.models import VLLMwithChatTemplate
settings = [
('qwen2-0.5b-instruct-vllm', 'Qwen/Qwen2-0.5B-Instruct', 1),
('qwen2-1.5b-instruct-vllm', 'Qwen/Qwen2-1.5B-Instruct', 1),
('qwen2-7b-instruct-vllm', 'Qwen/Qwen2-7B-Instruct', 1),
('qwen2-57b-a14b-instruct-vllm', 'Qwen/Qwen2-57B-A14B-Instruct', 1),
('qwen2-72b-instruct-vllm', 'Qwen/Qwen2-72B-Instruct', 2),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLMwithChatTemplate,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus, gpu_memory_utilization=0.9),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=4096,
batch_size=16,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus, num_procs=1),
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen2-series/'
from opencompass.models import VLLM
settings = [
('qwen2-0.5b-vllm', 'Qwen/Qwen2-0.5B', 1),
('qwen2-1.5b-vllm', 'Qwen/Qwen2-1.5B', 1),
('qwen2-7b-vllm', 'Qwen/Qwen2-7B', 1),
('qwen2-72b-vllm', 'Qwen/Qwen2-72B', 4),
('qwen2-57b-a14b-vllm', 'Qwen/Qwen2-57B-A14B', 2),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLM,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus, gpu_memory_utilization=0.9),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus, num_procs=1),
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen-series-chat/'
from opencompass.models import VLLMwithChatTemplate
settings = [
('qwen-1.8b-chat-vllm', 'Qwen/Qwen-1_8B-Chat', 1),
('qwen-7b-chat-vllm', 'Qwen/Qwen-7B-Chat', 1),
('qwen-14b-chat-vllm', 'Qwen/Qwen-14B-Chat', 1),
('qwen-72b-chat-vllm', 'Qwen/Qwen-72B-Chat', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLMwithChatTemplate,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=1024,
batch_size=16,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus),
)
)
from mmengine.config import read_base
with read_base():
# from ..datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
# from ..datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from ..datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/qwen-series/'
from opencompass.models import VLLM
settings = [
('qwen-1.8b-vllm', 'Qwen/Qwen-1_8B', 1),
('qwen-7b-vllm', 'Qwen/Qwen-7B', 1),
('qwen-14b-vllm', 'Qwen/Qwen-14B', 1),
('qwen-72b-vllm', 'Qwen/Qwen-72B', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=VLLM,
abbr=abbr,
path=path,
            model_kwargs=dict(tensor_parallel_size=num_gpus),  # add quantization="awq" or quantization="gptq" to evaluate quantized models
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=num_gpus, num_procs=1),
)
)
__version__ = '0.4.1'
# flake8: noqa
# yapf: disable
import argparse
import copy
import getpass
import os
import os.path as osp
from datetime import datetime
from mmengine.config import Config, DictAction
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import LarkReporter, get_logger
from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
get_config_from_arg)
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', nargs='?', help='Config file path')
    # add mutually exclusive args `--slurm` / `--dlc`; defaults to the local
    # runner if "infer" or "eval" is not specified in the config
launch_method = parser.add_mutually_exclusive_group()
launch_method.add_argument('--slurm',
action='store_true',
default=False,
help='Whether to force tasks to run with srun. '
'If True, `--partition(-p)` must be set. '
'Defaults to False')
launch_method.add_argument('--dlc',
action='store_true',
default=False,
help='Whether to force tasks to run on dlc. If '
'True, `--aliyun-cfg` must be set. Defaults'
' to False')
# Add shortcut parameters (models, datasets and summarizer)
    parser.add_argument('--models', nargs='+', help='Abbreviations of the models to run', default=None)
    parser.add_argument('--datasets', nargs='+', help='Abbreviations of the datasets to run', default=None)
    parser.add_argument('--summarizer', help='Abbreviation of the summarizer to use', default=None)
# add general args
parser.add_argument('--debug',
help='Debug mode, in which scheduler will run tasks '
'in the single process, and output will not be '
'redirected to files',
action='store_true',
default=False)
parser.add_argument('--dry-run',
help='Dry run mode, in which the scheduler will not '
'actually run the tasks, but only print the commands '
'to run',
action='store_true',
default=False)
parser.add_argument(
'-a', '--accelerator',
        help='Inference accelerator; vllm and lmdeploy are currently supported.',
choices=['vllm', 'lmdeploy', None],
default=None,
type=str)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
'only want the inference results, or "eval" if you '
'already have the results and want to evaluate them, '
'or "viz" if you want to visualize the results.',
choices=['all', 'infer', 'eval', 'viz'],
default='all',
type=str)
parser.add_argument('-r',
'--reuse',
nargs='?',
type=str,
const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs presented in the config. If no '
                        'argument is given, the latest results in the '
                        'work_dir will be reused. The argument can also '
                        'be a specific timestamp, e.g. 20230516_144254')
parser.add_argument('-w',
'--work-dir',
                        help='Work path, all the outputs will be '
                        'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc. '
                        'If not specified, the work_dir will be set to '
                        'outputs/default.',
default=None,
type=str)
parser.add_argument(
'--config-dir',
default='configs',
        help='Use a custom config directory instead of configs/ to '
        'search the configs for datasets, models and summarizers',
type=str)
parser.add_argument('-l',
'--lark',
help='Report the running status to lark bot',
action='store_true',
default=False)
parser.add_argument('--max-num-workers',
                        help='Max number of workers to run in parallel. '
                        'Will be overridden by the "max_num_workers" argument '
                        'in the config.',
type=int,
default=1)
parser.add_argument('--max-workers-per-gpu',
                        help='Max number of tasks to run in parallel on one '
                        'GPU. It will only be used in the local runner.',
type=int,
default=1)
parser.add_argument(
'--retry',
        help='Number of retries if the job failed when using slurm or dlc. '
        'Will be overridden by the "retry" argument in the config.',
type=int,
default=2)
parser.add_argument(
'--dump-eval-details',
help='Whether to dump the evaluation details, including the '
'correctness of each sample, bpb, etc.',
action='store_true',
)
parser.add_argument(
'--dump-extract-rate',
        help='Whether to calculate and dump the rate at which answers are '
        'successfully extracted from the predictions.',
action='store_true',
)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
# set dlc args
dlc_parser = parser.add_argument_group('dlc_args')
parse_dlc_args(dlc_parser)
# set hf args
hf_parser = parser.add_argument_group('hf_args')
parse_hf_args(hf_parser)
# set custom dataset args
custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
parse_custom_dataset_args(custom_dataset_parser)
args = parser.parse_args()
if args.slurm:
assert args.partition is not None, (
'--partition(-p) must be set if you want to use slurm')
if args.dlc:
assert os.path.exists(args.aliyun_cfg), (
'When launching tasks using dlc, it needs to be configured '
'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
' to specify a new path.')
return args
def parse_slurm_args(slurm_parser):
"""These args are all for slurm launch."""
slurm_parser.add_argument('-p',
'--partition',
help='Slurm partition name',
default=None,
type=str)
slurm_parser.add_argument('-q',
'--quotatype',
help='Slurm quota type',
default=None,
type=str)
slurm_parser.add_argument('--qos',
help='Slurm quality of service',
default=None,
type=str)
def parse_dlc_args(dlc_parser):
"""These args are all for dlc launch."""
dlc_parser.add_argument('--aliyun-cfg',
help='The config path for aliyun config',
default='~/.aliyun.cfg',
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
hf_parser.add_argument('--hf-path', type=str, help='The path to the HuggingFace model, e.g. "facebook/opt-125m", required')
hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the HuggingFace model')
hf_parser.add_argument('--tokenizer-path', type=str, help='The path to the HuggingFace tokenizer, same as --hf-path if not specified')
hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the tokenizer')
hf_parser.add_argument('--peft-path', type=str, help='The path to the PEFT model')
hf_parser.add_argument('--peft-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the PEFT model')
hf_parser.add_argument('--generation-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the generation')
hf_parser.add_argument('--max-seq-len', type=int, help='The max sequence length for the HuggingFace model')
hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model')
hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model')
hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model')
hf_parser.add_argument('--num-gpus', type=int, default=None, help='Deprecated, please use --hf-num-gpus instead')
hf_parser.add_argument('--hf-num-gpus', type=int, default=1, help='The number of GPUs for the HuggingFace model passed via cli')
hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model')
hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model')
def parse_custom_dataset_args(custom_dataset_parser):
"""These args are all for the quick construction of custom datasets."""
custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-data-type',
type=str,
choices=['mcq', 'qa'])
custom_dataset_parser.add_argument('--custom-dataset-infer-method',
type=str,
choices=['gen', 'ppl'])
def main():
args = parse_args()
if args.num_gpus is not None:
raise ValueError('The `--num-gpus` argument is deprecated, please use '
'`--hf-num-gpus` to describe number of gpus used for '
'the HuggingFace model instead.')
if args.dry_run:
args.debug = True
# initialize logger
logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')
cfg = get_config_from_arg(args)
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', os.path.join('outputs', 'default'))
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
if args.reuse:
if args.reuse == 'latest':
if not os.path.exists(cfg.work_dir) or not os.listdir(
cfg.work_dir):
logger.warning('No previous results to reuse!')
else:
dirs = os.listdir(cfg.work_dir)
dir_time_str = sorted(dirs)[-1]
else:
dir_time_str = args.reuse
        logger.info(f'Reusing experiments from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
current_workdir = cfg['work_dir']
logger.info(f'Current exp folder: {current_workdir}')
os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)
# dump config
output_config_path = osp.join(cfg.work_dir, 'configs',
f'{cfg_time_str}_{os.getpid()}.py')
cfg.dump(output_config_path)
    # The config is intentionally reloaded here to avoid the issue that
    # initialized types cannot be serialized
cfg = Config.fromfile(output_config_path, format_python_code=False)
    # report to lark bot if --lark is specified
if not args.lark:
cfg['lark_bot_url'] = None
elif cfg.get('lark_bot_url', None):
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
if args.mode in ['all', 'infer']:
        # When the user has specified --slurm or --dlc, or has not set
        # "infer" in the config, we provide a default configuration
        # for infer
if (args.dlc or args.slurm) and cfg.get('infer', None):
logger.warning('You have set "infer" in the config, but '
'also specified --slurm or --dlc. '
'The "infer" configuration will be overridden by '
'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('infer', None) is None:
fill_infer_cfg(cfg, args)
if args.partition is not None:
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
cfg.infer.runner.partition = args.partition
cfg.infer.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.infer.runner.debug = True
if args.lark:
cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
'predictions/')
partitioner = PARTITIONERS.build(cfg.infer.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.infer.runner)
        # Add the extra attack config if it exists
if hasattr(cfg, 'attack'):
for task in tasks:
cfg.attack.dataset = task.datasets[0][0].abbr
task.attack = cfg.attack
runner(tasks)
# evaluate
if args.mode in ['all', 'eval']:
        # When the user has specified --slurm or --dlc, or has not set
        # "eval" in the config, we provide a default configuration
        # for eval
if (args.dlc or args.slurm) and cfg.get('eval', None):
logger.warning('You have set "eval" in the config, but '
'also specified --slurm or --dlc. '
'The "eval" configuration will be overridden by '
'your runtime arguments.')
if args.dlc or args.slurm or cfg.get('eval', None) is None:
fill_eval_cfg(cfg, args)
if args.dump_eval_details:
cfg.eval.runner.task.dump_details = True
if args.dump_extract_rate:
cfg.eval.runner.task.cal_extract_rate = True
if args.partition is not None:
if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
cfg.eval.runner.partition = args.partition
cfg.eval.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.eval.runner.debug = True
if args.lark:
cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
tasks = partitioner(cfg)
if args.dry_run:
return
runner = RUNNERS.build(cfg.eval.runner)
# For meta-review-judge in subjective evaluation
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
for task_part in tasks:
runner(task_part)
else:
runner(tasks)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})
# For subjective summarizer
if summarizer_cfg.get('function', None):
main_summarizer_cfg = copy.deepcopy(summarizer_cfg)
grouped_datasets = {}
for dataset in cfg.datasets:
prefix = dataset['abbr'].split('_')[0]
if prefix not in grouped_datasets:
grouped_datasets[prefix] = []
grouped_datasets[prefix].append(dataset)
all_grouped_lists = []
for prefix in grouped_datasets:
all_grouped_lists.append(grouped_datasets[prefix])
dataset_score_container = []
for dataset in all_grouped_lists:
temp_cfg = copy.deepcopy(cfg)
temp_cfg.datasets = dataset
summarizer_cfg = dict(type=dataset[0]['summarizer']['type'], config=temp_cfg)
summarizer = build_from_cfg(summarizer_cfg)
dataset_score = summarizer.summarize(time_str=cfg_time_str)
if dataset_score:
dataset_score_container.append(dataset_score)
main_summarizer_cfg['config'] = cfg
main_summarizer = build_from_cfg(main_summarizer_cfg)
main_summarizer.summarize(time_str=cfg_time_str, subjective_scores=dataset_score_container)
else:
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
summarizer_cfg['type'] = DefaultSummarizer
summarizer_cfg['config'] = cfg
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import bbh_datasets
from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
# ARC Prize Public Evaluation
#### Overview
The spirit of ARC Prize is to open-source progress towards AGI. To win prize money, you will be required to publish reproducible code/methods into the public domain.
ARC Prize measures AGI progress using the [ARC-AGI private evaluation set](https://arcprize.org/guide#private) ([leaderboard here](https://arcprize.org/leaderboard)); the Grand Prize is unlocked once the first team reaches [at least 85%](https://arcprize.org/guide#grand-prize-goal).
Note: the private evaluation set imposes limitations on solutions (e.g., no internet access, so no GPT-4/Claude/etc.). There is a [secondary leaderboard](https://arcprize.org/leaderboard) called ARC-AGI-Pub; it measures the [public evaluation set](https://arcprize.org/guide#public-tasks) and imposes no limits, but it is not part of ARC Prize 2024 at this time.
#### Tasks
An ARC-AGI task is a series of three to five input/output examples, followed by a final item for which only the input is given. Each task tests the utilization of a specific learned skill based on a minimal number of cognitive priors.
![alt text](https://arcprize.org/media/images/arc-task-grids.jpg)
Tasks are represented in JSON, with each grid stored as a list of lists of integers. These JSON objects can also be rendered visually as a grid of colors using an ARC-AGI task viewer.
A successful submission is a pixel-perfect description (color and position) of the final task's output.
#### Format
As mentioned above, tasks are stored in JSON format. Each JSON file consists of two key-value pairs.
`train`: a list of two to ten input/output pairs (typically three). These are used by your algorithm to infer a rule.
`test`: a list of one to three input/output pairs (typically one). Your model should apply the rule inferred from the train set to construct the output solution. You will have access to the test output solutions in the public data; the output solutions in the private evaluation set will not be revealed.
Here is an example of a simple ARC-AGI task with three training pairs and a single test pair. Each input and output grid is 2x2, and four colors are represented by the integers 1, 4, 6, and 8. Which actual color (red/green/blue/black) is applied to each integer is arbitrary and up to you.
```json
{
"train": [
{"input": [[1, 0], [0, 0]], "output": [[1, 1], [1, 1]]},
{"input": [[0, 0], [4, 0]], "output": [[4, 4], [4, 4]]},
{"input": [[0, 0], [6, 0]], "output": [[6, 6], [6, 6]]}
],
"test": [
{"input": [[0, 0], [0, 8]], "output": [[8, 8], [8, 8]]}
]
}
```
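Since a successful submission must match the expected output pixel for pixel, scoring a prediction reduces to exact grid equality. Below is a minimal sketch (not an official scorer) assuming a task saved to `task.json` in the format shown above:

```python
import json

def score_task(task_path: str, predicted_grids: list) -> float:
    """Return the fraction of test pairs predicted pixel-perfectly."""
    with open(task_path) as f:
        task = json.load(f)
    tests = task["test"]
    correct = sum(
        pred == pair["output"]  # exact match: same dimensions, same integers
        for pred, pair in zip(predicted_grids, tests)
    )
    return correct / len(tests)

# The example task above has one test pair; a pixel-perfect prediction scores 1.0.
print(score_task("task.json", [[[8, 8], [8, 8]]]))
```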
#### Performance
| Qwen2.5-72B-Instruct | LLaMA3.1-70B-Instruct | gemma-2-27b-it |
| ----- | ----- | ----- |
| 0.09 | 0.06 | 0.05 |
from mmengine.config import read_base
with read_base():
from .arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator
# The system_prompt defines the initial instructions for the model,
# setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''
# User message template is a template for creating user prompts. It includes placeholders for training data and test input data,
# guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''
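# Note: {training_data} and {input_test_data} above are filled from the reader
# columns defined below, while doubled braces ({{ and }}) render as literal
# braces in the final prompt instead of being treated as placeholders.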
arc_prize_public_evaluation_reader_cfg = dict(
input_columns=['training_data', 'input_test_data'],
output_column='output_test_data'
)
arc_prize_public_evaluation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='SYSTEM', prompt=system_prompt),
dict(role='HUMAN', prompt=user_message_template),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
arc_prize_public_evaluation_eval_cfg = dict(
evaluator=dict(type=ARCPrizeEvaluator)
)
arc_prize_public_evaluation_datasets = [
dict(
abbr='ARC_Prize_Public_Evaluation',
type=ARCPrizeDataset,
path='opencompass/arc_prize_public_evaluation',
reader_cfg=arc_prize_public_evaluation_reader_cfg,
infer_cfg=arc_prize_public_evaluation_infer_cfg,
eval_cfg=arc_prize_public_evaluation_eval_cfg
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import ARCDatasetClean as ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
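# PPL-style multiple choice: one prompt is rendered per candidate answer (A-D),
# the model scores each full continuation by perplexity, and the option with
# the lowest perplexity is taken as the prediction.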
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'A':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textA}')
], ),
'B':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textB}')
], ),
'C':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textC}')
], ),
'D':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textD}')
], ),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
analyze_contamination=True)
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c-test',
path='opencompass/ai2_arc-test',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg)
]