"docs/en_US/vscode:/vscode.git/clone" did not exist on "8a175cd463b77b194f0fb05e258d2facc05110f7"
Commit c289ecc0 authored by xinghao's avatar xinghao
Browse files

Initial commit

parents
Pipeline #3004 canceled with stages
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
    from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import nq_datasets
    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
    from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import bbh_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets
    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
    from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets

# Gather every imported `*_datasets` list from the local namespace into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
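# --- Illustrative sketch, not part of the original config ---
# To actually launch an evaluation, a `models` list is defined alongside `datasets`
# and this file is passed to OpenCompass (e.g. `python run.py <config>.py`).
# The entry below uses placeholder assumptions (model name and checkpoint path),
# not a recommended setup.
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='demo-7b',              # illustrative model name
        path='path/to/your/model',   # placeholder checkpoint path
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]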
# ARC Prize Public Evaluation
#### Overview
The spirit of ARC Prize is to open-source progress towards AGI. To win prize money, you will be required to publish reproducible code/methods into the public domain.
ARC Prize measures AGI progress using the [ARC-AGI private evaluation set](https://arcprize.org/guide#private) ([leaderboard here](https://arcprize.org/leaderboard)); the Grand Prize is unlocked once the first team reaches [at least 85%](https://arcprize.org/guide#grand-prize-goal).
Note: the private evaluation set imposes limitations on solutions (e.g. no internet access, so no GPT-4/Claude/etc.). There is a [secondary leaderboard](https://arcprize.org/leaderboard) called ARC-AGI-Pub, which measures performance on the [public evaluation set](https://arcprize.org/guide#public-tasks) and imposes no such limits, but it is not part of ARC Prize 2024 at this time.
#### Tasks
ARC-AGI tasks are a series of three to five example input/output pairs, followed by a final task for which only the input is given. Each task tests the utilization of a specific learned skill based on a minimal number of cognitive priors.
![Example ARC-AGI task grids](https://arcprize.org/media/images/arc-task-grids.jpg)
Task grids are represented as JSON lists of lists of integers. These JSON objects can also be rendered visually as grids of colors using an ARC-AGI task viewer.
A successful submission is a pixel-perfect reproduction (color and position) of the final task's output grid.
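For illustration, a grid can be rendered much like the task viewer does. The snippet below is a minimal sketch using `matplotlib` with an arbitrary palette; it is not part of the official ARC tooling:

```python
# Minimal sketch (not official ARC tooling): render a grid with matplotlib.
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Integers 0-9 map to colors; this palette is an arbitrary assumption.
ARC_COLORS = ListedColormap([
    '#000000', '#0074D9', '#FF4136', '#2ECC40', '#FFDC00',
    '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25',
])

def show_grid(grid):
    """Display a grid (list of lists of ints in 0-9) as colored cells."""
    plt.imshow(grid, cmap=ARC_COLORS, vmin=0, vmax=9)
    plt.axis('off')
    plt.show()

show_grid([[1, 0], [0, 0]])  # first training input from the example in the Format section
```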
#### Format
As mentioned above, tasks are stored in JSON format. Each JSON file consists of two key-value pairs.
`train`: a list of two to ten input/output pairs (typically three). These are used for your algorithm to infer a rule.
`test`: a list of one to three input/output pairs (typically one). Your model should apply the rule inferred from the train set and construct an output solution. The ground-truth test output is available for the public data; the output solution on the private evaluation set will not be revealed.
Here is an example of a simple ARC-AGI task with three training pairs and a single test pair. Each pair is shown as a 2x2 grid. Four colors are represented by the integers 1, 4, 6, and 8 (0 is the empty background); which actual color (red/green/blue/black) is applied to each integer is arbitrary and up to you.
```json
{
    "train": [
        {"input": [[1, 0], [0, 0]], "output": [[1, 1], [1, 1]]},
        {"input": [[0, 0], [4, 0]], "output": [[4, 4], [4, 4]]},
        {"input": [[0, 0], [6, 0]], "output": [[6, 6], [6, 6]]}
    ],
    "test": [
        {"input": [[0, 0], [0, 8]], "output": [[8, 8], [8, 8]]}
    ]
}
```
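As a minimal sketch of how such a task can be consumed, the following assumes the example above has been saved to a hypothetical file `example_task.json` and checks a candidate output grid pixel-perfectly:

```python
# Minimal sketch: load the task above (assumed saved as 'example_task.json')
# and check a candidate answer pixel-perfectly.
import json

with open('example_task.json') as f:
    task = json.load(f)

def is_correct(predicted, expected):
    # Exact list-of-lists comparison: every cell's color and position must match.
    return predicted == expected

test_pair = task['test'][0]
prediction = [[8, 8], [8, 8]]  # in practice this would come from your solver
print(is_correct(prediction, test_pair['output']))  # -> True for this example
```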
#### Performance
| Model | Qwen2.5-72B-Instruct | LLaMA3.1-70B-Instruct | gemma-2-27b-it |
| ----- | ----- | ----- | ----- |
| Accuracy | 0.09 | 0.06 | 0.05 |
from mmengine.config import read_base

with read_base():
    from .arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator
# The system_prompt defines the initial instructions for the model,
# setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''
# User message template is a template for creating user prompts. It includes placeholders for training data and test input data,
# guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''
arc_prize_public_evaluation_reader_cfg = dict(
input_columns=['training_data', 'input_test_data'],
output_column='output_test_data'
)
arc_prize_public_evaluation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='SYSTEM', prompt=system_prompt),
dict(role='HUMAN', prompt=user_message_template),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
arc_prize_public_evaluation_eval_cfg = dict(
evaluator=dict(type=ARCPrizeEvaluator)
)
arc_prize_public_evaluation_datasets = [
dict(
abbr='ARC_Prize_Public_Evaluation',
type=ARCPrizeDataset,
path='opencompass/arc_prize_public_evaluation',
reader_cfg=arc_prize_public_evaluation_reader_cfg,
infer_cfg=arc_prize_public_evaluation_infer_cfg,
eval_cfg=arc_prize_public_evaluation_eval_cfg
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator
# The system_prompt defines the initial instructions for the model,
# setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''
# User message template is a template for creating user prompts. It includes placeholders for training data and test input data,
# guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''
arc_prize_public_evaluation_reader_cfg = dict(
input_columns=['training_data', 'input_test_data'],
output_column='output_test_data'
)
arc_prize_public_evaluation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=system_prompt),
dict(role='HUMAN', prompt=user_message_template),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
arc_prize_public_evaluation_eval_cfg = dict(
evaluator=dict(type=ARCPrizeEvaluator)
)
arc_prize_public_evaluation_datasets = [
dict(
abbr='ARC_Prize_Public_Evaluation',
type=ARCPrizeDataset,
path='opencompass/arc_prize_public_evaluation',
reader_cfg=arc_prize_public_evaluation_reader_cfg,
infer_cfg=arc_prize_public_evaluation_infer_cfg,
eval_cfg=arc_prize_public_evaluation_eval_cfg
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import ARCDatasetClean as ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'A':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textA}')
], ),
'B':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textB}')
], ),
'C':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textC}')
], ),
'D':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textD}')
], ),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
analyze_contamination=True)
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c-test',
path='opencompass/ai2_arc-test',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
{question}
A. {textA}
B. {textB}
C. {textC}
D. {textD}
""".strip()
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=QUERY_TEMPLATE)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey',
)
ARC_c_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
),
dict(role='BOT', prompt='{answerKey}'),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
inferencer=dict(type=GenInferencer, max_out_len=50),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey',
)
ARC_c_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
'A': dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textA}'),
],
),
'B': dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textB}'),
],
),
'C': dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textC}'),
],
),
'D': dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textD}'),
],
),
},
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
inferencer=dict(type=PPLInferencer),
)
ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c',
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
from mmengine.config import read_base

with read_base():
    from .ARC_c_gen_1e0de5 import ARC_c_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=
'Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:'
)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ARC_c_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
ARC_c_datasets = [
dict(
abbr='ARC-c',
type=ARCDataset,
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg,
)
]
from mmengine.config import read_base

with read_base():
    from .ARC_c_ppl_a450bd import ARC_c_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
opt: dict(
round=[
dict(role='HUMAN', prompt=f'{{question}}\nA. {{textA}}\nB. {{textB}}\nC. {{textC}}\nD. {{textD}}'),
dict(role='BOT', prompt=f'Answer: {opt}'),
]
) for opt in ['A', 'B', 'C', 'D']
},
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c',
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'A':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textA}')
], ),
'B':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textB}')
], ),
'C':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textC}')
], ),
'D':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textD}')
], ),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c',
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'A': 'Question: {question}\nAnswer: {textA}',
'B': 'Question: {question}\nAnswer: {textB}',
'C': 'Question: {question}\nAnswer: {textC}',
'D': 'Question: {question}\nAnswer: {textD}'
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c',
path='opencompass/ai2_arc-dev',
name='ARC-Challenge',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg)
]
from mmengine.config import read_base

with read_base():
    from .ARC_e_gen_1e0de5 import ARC_e_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
ARC_e_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_e_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=
'Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:'
)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ARC_e_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
ARC_e_datasets = [
dict(
abbr='ARC-e',
type=ARCDataset,
path='opencompass/ai2_arc-easy-dev',
name='ARC-Easy',
reader_cfg=ARC_e_reader_cfg,
infer_cfg=ARC_e_infer_cfg,
eval_cfg=ARC_e_eval_cfg,
)
]
from mmengine.config import read_base

with read_base():
    from .ARC_e_ppl_a450bd import ARC_e_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_e_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_e_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
opt: dict(
round=[
dict(role='HUMAN', prompt=f'{{question}}\nA. {{textA}}\nB. {{textB}}\nC. {{textC}}\nD. {{textD}}'),
dict(role='BOT', prompt=f'Answer: {opt}'),
]
) for opt in ['A', 'B', 'C', 'D']
},
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_e_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_e_datasets = [
dict(
type=ARCDataset,
abbr='ARC-e',
path='opencompass/ai2_arc-easy-dev',
name='ARC-Easy',
reader_cfg=ARC_e_reader_cfg,
infer_cfg=ARC_e_infer_cfg,
eval_cfg=ARC_e_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_e_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_e_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'A':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textA}')
], ),
'B':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textB}')
], ),
'C':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textC}')
], ),
'D':
dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt='{textD}')
], ),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_e_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_e_datasets = [
dict(
type=ARCDataset,
abbr='ARC-e',
path='opencompass/ai2_arc-easy-dev',
name='ARC-Easy',
reader_cfg=ARC_e_reader_cfg,
infer_cfg=ARC_e_infer_cfg,
eval_cfg=ARC_e_eval_cfg)
]