Unverified Commit b03d5dc5 authored by Fengzhe Zhou, committed by GitHub

[Sync] Sync Internal (#941)

parent bbec7d87
@@ -91,8 +91,12 @@ docs/zh_cn/_build/
# sft config ignore list
configs/sft_cfg/*B_*
configs/sft_cfg/1B/*
configs/sft_cfg/7B/*
configs/sft_cfg/20B/*
configs/sft_cfg/60B/*
configs/sft_cfg/100B/*
configs/cky/
# in case llama clone in the opencompass
llama/
@@ -120,3 +124,6 @@ turbomind/
*.csv
*.npy
*.c
# aliyun
core.*
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base
with read_base():
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
GaokaoBench_datasets = []
for folder, prompts in [
("Multiple-choice_Questions", MCQ_prompts),
("Fill-in-the-blank_Questions", FBQ_prompts),
]:
for p in prompts:
reader_cfg = {
"input_columns": ["question"],
"output_column": "answer",
}
infer_cfg = {
"ice_template": {
"type": PromptTemplate,
"template": {"round": [{"role": "HUMAN", "prompt": p["prefix_prompt"] + "{question}"}]},
"ice_token": "</E>",
},
"retriever": {"type": ZeroRetriever},
"inferencer": {"type": GenInferencer, "max_out_len": 1024},
}
eval_cfg = {
"evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
"pred_role": "BOT",
}
dataset = {
"type": GaokaoBenchDataset,
"abbr": "GaokaoBench_" + p["keyword"],
"path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
"reader_cfg": reader_cfg,
"infer_cfg": infer_cfg,
"eval_cfg": eval_cfg,
}
GaokaoBench_datasets.append(dataset)
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base
with read_base():
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
GaokaoBench_datasets = []
for folder, prompts in [
("Multiple-choice_Questions", MCQ_prompts),
("Fill-in-the-blank_Questions", FBQ_prompts),
]:
for p in prompts:
reader_cfg = {
"input_columns": ["question"],
"output_column": "answer",
}
infer_cfg = {
"prompt_template": {
"type": PromptTemplate,
"template": p["prefix_prompt"] + "{question}",
},
"retriever": {"type": ZeroRetriever},
"inferencer": {"type": GenInferencer, "max_out_len": 1024},
}
eval_cfg = {
"evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
"pred_role": "BOT",
}
dataset = {
"type": GaokaoBenchDataset,
"abbr": "GaokaoBench_" + p["keyword"],
"path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
"reader_cfg": reader_cfg,
"infer_cfg": infer_cfg,
"eval_cfg": eval_cfg,
}
GaokaoBench_datasets.append(dataset)
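Both variants above expand to one dataset dict per prompt entry; only infer_cfg differs (a chat-style ice_template in the first file versus a plain string prompt_template in the second). A minimal, self-contained sketch of that expansion, using hypothetical shortened stand-ins for the real MCQ_prompts/FBQ_prompts entries:

import os

# Hypothetical stand-ins for the real prompt entries (prefix_prompt shortened).
demo_mcq = [{"type": "single_choice", "keyword": "2010-2022_Math_II_MCQs", "prefix_prompt": "请你做一道数学选择题\n题目如下:"}]
demo_fbq = [{"type": "cloze", "keyword": "2010-2022_Math_I_Fill-in-the-Blank", "prefix_prompt": "请解答下面的数学填空题\n题目如下:"}]

for folder, prompts in [
    ("Multiple-choice_Questions", demo_mcq),
    ("Fill-in-the-blank_Questions", demo_fbq),
]:
    for p in prompts:
        # Same derivations as the loops above: abbr, data path, evaluator name.
        print("GaokaoBench_" + p["keyword"])
        print(os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"))
        print("GaokaoBenchEvaluator" + "_" + p["type"])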
MCQ_prompts = [
{
"type": "single_choice",
"keyword": "2010-2022_Math_II_MCQs",
"prefix_prompt": "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
"comment": "",
},
{
"type": "single_choice",
"keyword": "2010-2022_Math_I_MCQs",
"prefix_prompt": "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
"comment": "",
},
{
"type": "single_choice",
"keyword": "2010-2022_History_MCQs",
"prefix_prompt": "请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
},
{
"type": "single_choice",
"keyword": "2010-2022_Biology_MCQs",
"prefix_prompt": "请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
},
{
"type": "single_choice",
"keyword": "2010-2022_Political_Science_MCQs",
"prefix_prompt": "请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
},
{
"type": "multi_choice",
"keyword": "2010-2022_Physics_MCQs",
"prefix_prompt": "请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n",
},
{
"type": "single_choice",
"keyword": "2010-2022_Chemistry_MCQs",
"prefix_prompt": "请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
},
{
"type": "single_choice",
"keyword": "2010-2013_English_MCQs",
"prefix_prompt": "请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
},
{
"type": "multi_question_choice",
"keyword": "2010-2022_Chinese_Modern_Lit",
"prefix_prompt": "请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n",
},
{
"type": "multi_question_choice",
"keyword": "2010-2022_English_Fill_in_Blanks",
"prefix_prompt": "请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n",
},
{
"type": "five_out_of_seven",
"keyword": "2012-2022_English_Cloze_Test",
"prefix_prompt": "请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n",
},
{
"type": "multi_question_choice",
"keyword": "2010-2022_Geography_MCQs",
"prefix_prompt": "请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n",
},
{
"type": "multi_question_choice",
"keyword": "2010-2022_English_Reading_Comp",
"prefix_prompt": "请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n",
},
{
"type": "multi_question_choice",
"keyword": "2010-2022_Chinese_Lang_and_Usage_MCQs",
"prefix_prompt": "请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:",
},
]
FBQ_prompts = [
{
"type": "cloze",
"keyword": "2010-2022_Math_I_Fill-in-the-Blank",
"prefix_prompt": "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:",
"comment": "",
},
{
"type": "cloze",
"keyword": "2010-2022_Math_II_Fill-in-the-Blank",
"prefix_prompt": "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:",
"comment": "",
},
{
"type": "cloze",
"keyword": "2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation",
"prefix_prompt": "请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "cloze",
"keyword": "2014-2022_English_Language_Cloze_Passage",
"prefix_prompt": "请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
]
OEQ_prompts = [
{
"type": "subjective",
"keyword": "2010-2022_Geography_Open-ended_Questions",
"prefix_prompt": "请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Chemistry_Open-ended_Questions",
"prefix_prompt": "请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Math_I_Open-ended_Questions",
"prefix_prompt": "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_History_Open-ended_Questions",
"prefix_prompt": "请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Biology_Open-ended_Questions",
"prefix_prompt": "请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Math_II_Open-ended_Questions",
"prefix_prompt": "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Physics_Open-ended_Questions",
"prefix_prompt": "请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Political_Science_Open-ended_Questions",
"prefix_prompt": "请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "correction",
"keyword": "2012-2022_English_Language_Error_Correction",
"prefix_prompt": "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
# "prefix_prompt": [
# "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
# "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
# ],
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Chinese_Language_Ancient_Poetry_Reading",
"prefix_prompt": "请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Chinese_Language_Practical_Text_Reading",
"prefix_prompt": "请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Chinese_Language_Literary_Text_Reading",
"prefix_prompt": "请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Chinese_Language_Classical_Chinese_Reading",
"prefix_prompt": "请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
{
"type": "subjective",
"keyword": "2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions",
"prefix_prompt": "请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:",
"comment": "",
},
]
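In the configs above, each prefix_prompt is concatenated with the literal placeholder {question}, which the prompt template fills per sample, i.e. an ordinary str.format substitution. A standalone illustration with a shortened stand-in prompt:

# Shortened stand-in for a real prefix_prompt entry.
prefix_prompt = "请你做一道数学选择题\n请你严格按照格式作答。\n题目如下:"
template = prefix_prompt + "{question}"

# Per-sample fill, as the PromptTemplate does with the "question" column.
print(template.format(question="1+1等于几?\nA. 1\nB. 2\nC. 3\nD. 4"))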
@@ -4,37 +4,36 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess

TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = (
    "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. "
    "If the Answer type in [bool], the answer needs to be True or False. "
    "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. "
    "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. "
    "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)."
    "You need to output the answer in your final sentence like 'Therefore, the answer is ...'."
)
TheoremQA_prompt2 = (
    f"Below is an instruction that describes a task, paired with an input that provides further context. "
    f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n"
)

TheoremQA_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=TheoremQA_prompt2),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))

TheoremQA_datasets = [
    dict(
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg,
    )
]
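Note how TheoremQA_prompt2 mixes the two brace styles: the f-string substitutes {TheoremQA_prompt1} immediately, while the doubled {{Question}}/{{Answer_type}} survive as literal {Question}/{Answer_type} placeholders for the template engine to fill per sample. A standalone illustration:

instruction = "Answer the math problem."
# Single braces are filled now; doubled braces are kept as placeholders.
template = f"### Instruction:\n{instruction}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n"
print(template)                                                       # still contains {Question}, {Answer_type}
print(template.format(Question="1 + 1 = ?", Answer_type="integer"))   # fully rendered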
@@ -4,10 +4,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess

TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
@@ -15,34 +12,33 @@ TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the g
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(role="SYSTEM", fallback_role="HUMAN", prompt=TheoremQA_prompt1),
            ],
            round=[
                dict(role="HUMAN", prompt=TheoremQA_prompt2),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))

TheoremQA_datasets = [
    dict(
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg,
    )
]
@@ -4,34 +4,41 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess

TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))

TheoremQA_datasets = [
    dict(
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg,
    )
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2
TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=TheoremQA_prompt1 + TheoremQA_prompt2,
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
# A correct evaluator would need an LLM to extract the answer; this evaluation logic also yields a fairly large number of false negatives (FN).
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))
TheoremQA_datasets = [
dict(
abbr="TheoremQA",
type=TheoremQADataset,
path="./data/TheoremQA/test.csv",
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
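As the comment above says, robust answer extraction really wants an LLM; TheoremQA_postprocess_v2 is a rule-based stand-in. A hypothetical sketch of such a rule-based extractor (illustrative only, not the library's actual implementation):

import re

def extract_final_answer(prediction: str) -> str:
    """Hypothetical: grab whatever follows the 'Therefore, the answer is' cue
    the prompt instructs the model to emit; fall back to the last non-empty line."""
    match = re.search(r"[Tt]herefore,?\s+the answer is\s*(.+)", prediction)
    if match:
        return match.group(1).strip().rstrip(".")
    lines = [l for l in prediction.strip().splitlines() if l.strip()]
    return lines[-1].strip() if lines else ""

print(extract_final_answer("x = 0.25 ... Therefore, the answer is 0.25."))  # -> 0.25

Strict string matching of this kind is exactly what produces the false negatives mentioned in the comment: a correct answer phrased differently scores zero.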
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2
TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
# A correct evaluator would need an LLM to extract the answer; this evaluation logic also yields a fairly large number of false negatives (FN).
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))
TheoremQA_datasets = [
dict(
abbr="TheoremQA",
type=TheoremQADataset,
path="./data/TheoremQA/test.csv",
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
for _name in bbh_multiple_choice_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role="BOT",
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path=f"./data/BBH/data",
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")
bbh_datasets.append(
dict(
type=BBHDataset,
path=f"./data/BBH/data",
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
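For the multiple-choice subsets, bbh_mcq_postprocess is applied to both the prediction and the gold target before BBHEvaluator_mcq computes accuracy. A hypothetical sketch of option extraction in that spirit (not the library's actual code):

import re

def extract_option(text: str) -> str:
    """Hypothetical: return the first '(X)'-style option letter in the text."""
    match = re.search(r"\(([A-R])\)", text)
    return match.group(1) if match else text.strip()

pred = "Let's think step by step. The most similar movie is (B) Inception."
print(extract_option(pred))  # -> B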
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
ceval_subject_mapping = {
'computer_network': ['Computer Network', '计算机网络', 'STEM'],
'operating_system': ['Operating System', '操作系统', 'STEM'],
'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
'college_programming': ['College Programming', '大学编程', 'STEM'],
'college_physics': ['College Physics', '大学物理', 'STEM'],
'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
'college_economics': ['College Economics', '大学经济学', 'Social Science'],
'business_administration': ['Business Administration', '工商管理', 'Social Science'],
'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
'education_science': ['Education Science', '教育学', 'Social Science'],
'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
'logic': ['Logic', '逻辑学', 'Humanities'],
'law': ['Law', '法学', 'Humanities'],
'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
'art_studies': ['Art Studies', '艺术学', 'Humanities'],
'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
'high_school_history': ['High School History', '高中历史', 'Humanities'],
'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
'civil_servant': ['Civil Servant', '公务员', 'Other'],
'sports_science': ['Sports Science', '体育学', 'Other'],
'plant_protection': ['Plant Protection', '植物保护', 'Other'],
'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
'accountant': ['Accountant', '注册会计师', 'Other'],
'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
'physician': ['Physician', '医师资格', 'Other'],
}
ceval_all_sets = list(ceval_subject_mapping.keys())
ceval_datasets = []
for _split in ["val", "test"]:
for _name in ceval_all_sets:
ceval_reader_cfg = dict(
input_columns=["question", "A", "B", "C", "D"],
output_column="answer",
train_split="dev",
test_split=_split,
)
_ch_name = ceval_subject_mapping[_name][1]
hint = f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。"
question_and_options = "{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
ceval_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={answer: f"{question_and_options}\n答案: {answer}\n" for answer in ["A", "B", "C", "D"]},
),
prompt_template=dict(
type=PromptTemplate,
template={answer: f"{hint}\n</E>{question_and_options}\n答案: {answer}" for answer in ["A", "B", "C", "D"]},
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ceval_datasets.append(
dict(
type=CEvalDataset,
path="./data/ceval/formal_ceval",
name=_name,
abbr="ceval-" + _name if _split == "val" else "ceval-test-" + _name,
reader_cfg=ceval_reader_cfg,
infer_cfg=ceval_infer_cfg,
eval_cfg=ceval_eval_cfg,
))
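The template dicts above render one complete prompt per candidate option, and PPLInferencer picks the candidate whose rendering the model assigns the lowest loss. A toy sketch of that selection rule, with made-up per-candidate negative log-likelihoods:

# Made-up average NLL for the four fully rendered prompts of one question.
candidate_nll = {"A": 2.31, "B": 1.87, "C": 2.44, "D": 2.02}

# PPL-style decision: lowest loss (highest likelihood) wins.
prediction = min(candidate_nll, key=candidate_nll.get)
print(prediction)  # -> B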
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
hint = f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。"
question_and_options = "题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
cmmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={answer: f"{question_and_options}\n答案是: {answer}\n" for answer in ["A", "B", "C", "D"]},
),
prompt_template=dict(
type=PromptTemplate,
template={answer: f"{hint}\n</E>{question_and_options}\n答案是: {answer}" for answer in ["A", "B", "C", "D"]},
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer),
)
cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path="./data/cmmlu/",
name=_name,
abbr=f"cmmlu-{_name}",
reader_cfg=dict(
input_columns=["question", "A", "B", "C", "D"],
output_column="answer",
train_split="dev",
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
))
del _name, _ch_name
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
from ..ceval.ceval_ppl_1cd8bf import ceval_datasets
from ..GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import GaokaoBench_datasets
from ..triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets
from ..nq.nq_open_1shot_gen_20a989 import nq_datasets
from ..race.race_ppl_abed12 import race_datasets
from ..winogrande.winogrande_5shot_ll_9d81d7 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
from ..bbh.bbh_gen_0a5495 import bbh_datasets
from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets
from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
from ..mbpp.sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
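The last line relies on read_base() having imported every *_datasets list into the module namespace, then flattens them with sum. The same idiom in isolation:

# Two stand-in dataset lists, as read_base() would leave behind.
mmlu_datasets = [{"abbr": "mmlu-stem"}]
gsm8k_datasets = [{"abbr": "gsm8k"}]

datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
print([d["abbr"] for d in datasets])  # -> ['mmlu-stem', 'gsm8k']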
from mmengine.config import read_base
with read_base():
from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ..ceval.ceval_internal_gen_2daf24 import ceval_datasets
from ..GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
from ..triviaqa.triviaqa_wiki_1shot_gen_eaf81e import triviaqa_datasets
from ..nq.nq_open_1shot_gen_01cf41 import nq_datasets
from ..race.race_gen_69ee4f import race_datasets
from ..winogrande.winogrande_5shot_gen_6447e6 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ..math.math_evaluatorv2_gen_265cce import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='''\
Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?
Let's think step by step
Answer:
Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.
For the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.
Angelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.
However, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.
They also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.
And they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.
So Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.
They want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75
They will need to plan to study 4 days to allow for all the time they need.
The answer is 4
Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?
Let's think step by step
Answer:
Mark's team scores 25 2 pointers, meaning they scored 25*2 = 50 points in 2 pointers.
His team also scores 8 3 pointers, meaning they scored 8*3 = 24 points in 3 pointers
They scored 10 free throws, and free throws count as one point so they scored 10*1 = 10 points in free throws.
All together his team scored 50+24+10 = 84 points
Mark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2 = 100 points in 2 pointers.
His opponents scored half his team's number of 3 pointers, meaning they scored 24/2 = 12 points in 3 pointers.
They also scored half Mark's team's points in free throws, meaning they scored 10/2 = 5 points in free throws.
All together Mark's opponents scored 100+12+5 = 117 points
The total score for the game is both team's scores added together, so it is 84+117 = 201 points
The answer is 201
Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?
Let's think step by step
Answer:
When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24
The total number of marbles she'll have is 60+24 = 84
If Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.
If Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.
The total number of frisbees she'll have will increase to 30+12 = 42
Bella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards
If she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.
The total number of deck cards she'll have is 10+4 = 14
Together, Bella will have a total of 14+42+84 = 140 items
The answer is 140
Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?
Let's think step by step
Answer:
For the first three baskets, the number of apples and oranges in one basket is 9+15 = 24
In total, together with bananas, the number of fruits in one basket is 24+14 = 38 for the first three baskets.
Since there are three baskets each having 38 fruits, there are 3*38 = 114 fruits in the first three baskets.
The number of apples in the fourth basket is 9-2 = 7
There are also 15-2 = 13 oranges in the fourth basket
The combined number of oranges and apples in the fourth basket is 13+7 = 20
The fourth basket also contains 14-2 = 12 bananas.
In total, the fourth basket has 20+12 = 32 fruits.
The four baskets together have 32+114 = 146 fruits.
The answer is 146
Question: {question}
Let's think step by step
Answer:
'''),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
gsm8k_eval_cfg = dict(
evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
]
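Every exemplar ends with "The answer is N", and gsm8k_postprocess reduces a completion to a single number for exact-match scoring. A hypothetical sketch of that kind of extraction (not the library's actual code):

import re

def extract_last_number(completion: str) -> str:
    """Hypothetical: take the last integer or decimal appearing in the completion."""
    numbers = re.findall(r"-?\d+(?:\.\d+)?", completion.replace(",", ""))
    return numbers[-1] if numbers else ""

print(extract_last_number("... so it is 84+117 = 201 points\nThe answer is 201"))  # -> 201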
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import hellaswagDatasetwithICE
from opencompass.utils.text_postprocessors import first_option_postprocess
hellaswag_reader_cfg = dict(
input_columns=["ctx", "A", "B", "C", "D"],
output_column="label",
train_split="train",
test_split="val",
)
hellaswag_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt=f"{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?"),
dict(role="BOT", prompt="{label}\n"),
]
),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role="HUMAN", prompt="Continue the following text without adding any additional information or formatting:\n"),
"</E>",
],
round=[
dict(role="HUMAN", prompt=f"{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?"),
dict(role="BOT", prompt="{label}\n"),
],
),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))),
inferencer=dict(type=GenInferencer),
)
hellaswag_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type=first_option_postprocess, options="ABCD"),
)
hellaswag_datasets = [
dict(
abbr="hellaswag",
type=hellaswagDatasetwithICE,
path="./data/hellaswag/",
reader_cfg=hellaswag_reader_cfg,
infer_cfg=hellaswag_infer_cfg,
eval_cfg=hellaswag_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import hellaswagDatasetwithICE
from opencompass.utils.text_postprocessors import first_capital_postprocess
hellaswag_reader_cfg = dict(
input_columns=["ctx", "A", "B", "C", "D"],
output_column="label",
train_split="train",
test_split="val",
)
hint = "Continue the following text without adding any additional information or formatting:"
question_and_options = "{ctx}\nA) {A}\nB) {B}\nC) {C}\nD) {D}\nWhat is the right option?"
hellaswag_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={answer: f'{question_and_options}\n{answer}\n' for answer in ["A", "B", "C", "D"]},
),
prompt_template=dict(
type=PromptTemplate,
template={answer: f"{hint}\n</E>{question_and_options}\n{answer}" for answer in ["A", "B", "C", "D"]},
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))),
inferencer=dict(type=PPLInferencer),
)
hellaswag_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess),
)
hellaswag_datasets = [
dict(
abbr="hellaswag",
type=hellaswagDatasetwithICE,
path="./data/hellaswag/",
reader_cfg=hellaswag_reader_cfg,
infer_cfg=hellaswag_infer_cfg,
eval_cfg=hellaswag_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess_v2
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='Complete the following python code:\n{prompt}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvaluator),
pred_role='BOT',
    k=[1, 10, 100],  # this parameter is used only for humaneval
pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
humaneval_datasets = [
dict(
abbr='openai_humaneval',
type=HumanevalDataset,
path='./data/humaneval/human-eval-v2-20210705.jsonl',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]
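Models answering "Complete the following python code:" often wrap the solution in a markdown fence or restate the signature; humaneval_postprocess_v2 exists to strip such wrappers before the evaluator executes the completion against the unit tests. A hypothetical sketch of fence stripping in that spirit (not the library's actual code):

import re

def strip_code_fence(completion: str) -> str:
    """Hypothetical: if the reply contains a ```python fence, keep only its body."""
    match = re.search(r"```(?:python)?\n(.*?)```", completion, re.DOTALL)
    return match.group(1) if match else completion

reply = "Sure, here you go:\n```python\ndef add(a, b):\n    return a + b\n```"
print(strip_code_fence(reply))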