Commit c94cc943 authored by Leymore, committed by gaotong

Add release contribution

parent e6b5bdcb
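# C-Eval (GLM choice variant): 5-shot multiple-choice evaluation over the 52 Chinese exam
# subjects listed below. FixKRetriever with fix_id_list=[0, 1, 2, 3, 4] pins the first five
# dev-split items as in-context examples, GLMChoiceInferencer scores options A-D, and
# AccEvaluator reports accuracy.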
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GLMChoiceInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CEvalDataset
ceval_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split="val")
ceval_prompt_template = dict(
type=PromptTemplate,
template=None,
ice_token='</E>',
)
ceval_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
answer:
f'{{question}}\n(A) {{A}}\n(B) {{B}}\n(C) {{C}}\n(D) {{D}}\n答案: ({answer}) {{{answer}}}\n'
for answer in ['A', 'B', 'C', 'D']
}),
prompt_template=ceval_prompt_template,
retriever=dict(type=FixKRetriever),
inferencer=dict(type=GLMChoiceInferencer, fix_id_list=[0, 1, 2, 3, 4]))
ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ceval_all_sets = [
"操作系统",
"初中地理",
"初中化学",
"初中历史",
"初中生物",
"初中数学",
"初中物理",
"初中政治",
"大学编程",
"大学化学",
"大学经济学",
"大学物理",
"大学中国史",
"导游资格",
"法律职业资格",
"法学",
"概率统计",
"高等数学",
"高中地理",
"高中化学",
"高中历史",
"高中生物",
"高中数学",
"高中物理",
"高中语文",
"高中政治",
"公务员",
"工商管理",
"环境影响评价工程师",
"基础医学",
"计算机网络",
"计算机组成",
"教师资格",
"教育学",
"离散数学",
"临床医学",
"逻辑学",
"马克思主义基本原理",
"毛泽东思想和中国特色社会主义理论体系概论",
"兽医学",
"税务师",
"思想道德修养与法律基础",
"体育学",
"医师资格",
"艺术学",
"植物保护",
"中国语言文学",
"注册城乡规划师",
"注册电气工程师",
"注册会计师",
"注册计量师",
"注册消防工程师",
]
ceval_datasets = []
for _name in ceval_all_sets:
ceval_datasets.append(
dict(
type=CEvalDataset,
path="./data/ceval/release_ceval",
name=_name,
abbr='ceval-' + _name,
reader_cfg=ceval_reader_cfg,
infer_cfg=ceval_infer_cfg.copy(),
eval_cfg=ceval_eval_cfg.copy()))
ceval_datasets[-1]['infer_cfg'][
'prompt_template'] = ceval_prompt_template.copy()
ceval_datasets[-1]['infer_cfg']['prompt_template']['template'] = dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=f'以下是中国关于{_name}考试的单项选择题,请选出其中的正确答案。'),
'</E>',
],
round=[
dict(
role='HUMAN',
prompt=
'{question}\n(A) {A}\n(B) {B}\n(C) {C}\n(D) {D}\n答案: ('),
],
)
del _name
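# MMLU (GLM choice variant): loads the lukaemon/mmlu subsets from HuggingFace with the same
# 5-shot FixKRetriever + GLMChoiceInferencer setup as the C-Eval config above. Only
# college_biology is currently enabled in mmlu_all_sets; the other 56 subjects are commented out.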
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GLMChoiceInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
mmlu_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='validation')
mmlu_prompt_template = dict(
type=PromptTemplate,
template=None,
column_token_map={
'input': '</input>',
'A': '</A>',
'B': '</B>',
'C': '</C>',
'D': '</D>',
'target': '</target>'
},
ice_token='</E>',
)
mmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
target: '</input>\n(A) </A>\n(B) </B>\n(C) </C>\n(D) </D>\n'
f'Answer: ({target}) </{target}>\n'
for target in ['A', 'B', 'C', 'D']
},
column_token_map={
'input': '</input>',
'A': '</A>',
'B': '</B>',
'C': '</C>',
'D': '</D>',
'target': '</target>'
}),
prompt_template=mmlu_prompt_template,
retriever=dict(type=FixKRetriever),
inferencer=dict(type=GLMChoiceInferencer, fix_id_list=[0, 1, 2, 3, 4]))
mmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
mmlu_all_sets = [
"college_biology",
# "college_chemistry",
# "college_computer_science",
# "college_mathematics",
# "college_physics",
# "electrical_engineering",
# "astronomy",
# "anatomy",
# "abstract_algebra",
# "machine_learning",
# "clinical_knowledge",
# "global_facts",
# "management",
# "nutrition",
# "marketing",
# "professional_accounting",
# "high_school_geography",
# "international_law",
# "moral_scenarios",
# "computer_security",
# "high_school_microeconomics",
# "professional_law",
# "medical_genetics",
# "professional_psychology",
# "jurisprudence",
# "world_religions",
# "philosophy",
# "virology",
# "high_school_chemistry",
# "public_relations",
# "high_school_macroeconomics",
# "human_sexuality",
# "elementary_mathematics",
# "high_school_physics",
# "high_school_computer_science",
# "high_school_european_history",
# "business_ethics",
# "moral_disputes",
# "high_school_statistics",
# "miscellaneous",
# "formal_logic",
# "high_school_government_and_politics",
# "prehistory",
# "security_studies",
# "high_school_biology",
# "logical_fallacies",
# "high_school_world_history",
# "professional_medicine",
# "high_school_mathematics",
# "college_medicine",
# "high_school_us_history",
# "sociology",
# "econometrics",
# "high_school_psychology",
# "human_aging",
# "us_foreign_policy",
# "conceptual_physics",
]
mmlu_key_sets = [
'college_biology',
'college_chemistry',
'college_computer_science',
'college_mathematics',
'college_physics',
'electrical_engineering',
'astronomy',
'anatomy',
'abstract_algebra',
'machine_learning',
'clinical_knowledge',
'global_facts',
'management',
'nutrition',
'marketing',
'professional_accounting',
]
mmlu_datasets = []
for name in mmlu_all_sets:
mmlu_datasets.append(
dict(
type=HFDataset,
path='lukaemon/mmlu',
name=name,
reader_cfg=mmlu_reader_cfg,
infer_cfg=mmlu_infer_cfg.copy(),
eval_cfg=mmlu_eval_cfg))
mmlu_datasets[-1]['infer_cfg'][
'prompt_template'] = mmlu_prompt_template.copy()
mmlu_datasets[-1]['infer_cfg']['prompt_template']['template'] = dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=
f'The following are multiple choice questions (with answers) about {name.replace("_", " ")}.'
),
'</E>',
],
round=[
dict(
role='HUMAN',
prompt=
'</input>\n(A) </A>\n(B) </B>\n(C) </C>\n(D) </D>\nAnswer: ('),
],
)
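# GovRepcrs: zero-shot long-report summarization, generating up to 500 tokens within an
# 8192-token context and scoring with BLEU.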
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import GovRepcrsDataset
govrepcrs_reader_cfg = dict(
input_columns='content',
output_column='summary',
train_split='test',
test_split='test')
govrepcrs_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=
"Please summarize the following English report in English:{content}\n{summary}."),
retriever=dict(type=ZeroRetriever),
inferencer=dict(
type=GenInferencer, batch_size=4, max_out_len=500, max_seq_len=8192))
govrepcrs_eval_cfg = dict(
evaluator=dict(type=BleuEvaluator),
pred_postprocessor=dict(type='general_cn'),
dataset_postprocessor=dict(type='general_cn'))
govrepcrs_datasets = [
dict(
type=GovRepcrsDataset,
path='./data/govrep/',
abbr='GovRepcrs',
reader_cfg=govrepcrs_reader_cfg,
infer_cfg=govrepcrs_infer_cfg,
eval_cfg=govrepcrs_eval_cfg)
]
from mmengine.config import read_base
with read_base():
from .gsm8k_gen_2dd372 import gsm8k_datasets # noqa: F401, F403
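# GSM8K: 4-shot chain-of-thought prompting. Each exemplar walks through the arithmetic step by
# step and ends with "The answer is N", which the 'gsm8k' postprocessor extracts before
# AccEvaluator computes exact-match accuracy.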
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n"),
dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n"),
dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type='gsm8k'),
dataset_postprocessor=dict(type='gsm8k_dataset'))
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=HFDataset,
path='gsm8k',
name='main',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
]
from mmengine.config import read_base
with read_base():
from .humaneval_gen_d428f1 import humaneval_datasets # noqa: F401, F403
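# HumanEval: zero-shot code completion on openai_humaneval; HumanEvaluator computes pass@k for
# k = 1, 10 and 100 after the 'humaneval' postprocessor cleans up the generated completion.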
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HFDataset, HumanEvaluator
humaneval_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# TODO: allow empty output-column
humaneval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
humaneval_eval_cfg = dict(
evaluator=dict(type=HumanEvaluator),
k=[1, 10, 100],  # this parameter is specific to humaneval
pred_postprocessor=dict(type='humaneval'),
)
humaneval_datasets = [
dict(
type=HFDataset,
path='openai_humaneval',
reader_cfg=humaneval_reader_cfg,
infer_cfg=humaneval_infer_cfg,
eval_cfg=humaneval_eval_cfg)
]
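# LCSTS: zero-shot Chinese short-text summarization, scored with ROUGE.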
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import RougeEvaluator
from opencompass.datasets import LCSTSDataset
lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')
lcsts_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt='阅读以下文章,并给出简短的摘要:{content}\n摘要如下:'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
lcsts_eval_cfg = dict(
evaluator=dict(type=RougeEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type='lcsts'),
)
lcsts_datasets = [
dict(
type=LCSTSDataset,
abbr='lcsts',
path='./data/LCSTS',
reader_cfg=lcsts_reader_cfg,
infer_cfg=lcsts_infer_cfg,
eval_cfg=lcsts_eval_cfg)
]
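# MATH: 4-shot prompting with fully worked solutions; MATHEvaluator compares the extracted final
# answers against the reference solutions.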
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
"Problem:\nFind the domain of the expression $\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:"
),
dict(
role="BOT",
prompt=
"The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct."
),
dict(
role="HUMAN",
prompt=
"Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"
),
dict(
role="BOT",
prompt=
"We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct."
),
dict(
role="HUMAN",
prompt=
"Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"
),
dict(
role="BOT",
prompt=
"If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \begin{{align*}} 30n&=480\\ \Rightarrow\qquad n&=480/30=\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct."
),
dict(
role="HUMAN",
prompt=
"Problem:\nIf the system of equations: \begin{{align*}} 6x-4y&=a,\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"
),
dict(
role="BOT",
prompt=
"If we multiply the first equation by $-\frac{{3}}{{2}}$, we obtain $$6y-9x=-\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\frac{{3}}{{2}}a=b\Rightarrow\frac{{a}}{{b}}=\boxed{{-\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\frac{{2}}{{3}}$. I hope it is correct."
),
dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type='math'))
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]
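# MMLU (generation variant, chat-style prompts): 5-shot over all 57 subjects, with in-context
# examples rendered as HUMAN/BOT turns and predictions scored by taking the first capital letter
# of the generation ('first-capital' postprocessor).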
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
# None of the MMLU datasets on HuggingFace are parsed correctly, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
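# A minimal, untested sketch (not part of the evaluation config; run it separately) of fetching
# the archive above and unpacking it so MMLUDataset can read the per-subject CSVs from
# ./data/mmlu/. The destination layout assumed here is a guess -- adjust it to whatever your
# MMLUDataset checkout expects.
import os
import tarfile
import urllib.request

def download_mmlu(dest_dir: str = "./data/mmlu") -> None:
    """Download and unpack the Hendrycks MMLU archive into dest_dir."""
    os.makedirs(dest_dir, exist_ok=True)
    archive = os.path.join(dest_dir, "data.tar")
    # Fetch the tarball from the URL given in the comment above.
    urllib.request.urlretrieve(
        "https://people.eecs.berkeley.edu/~hendrycks/data.tar", archive)
    with tarfile.open(archive) as tar:
        # The tarball unpacks into a top-level "data/" folder holding dev/val/test CSVs.
        tar.extractall(dest_dir)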
mmlu_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
train_split='dev')
mmlu_all_sets = [
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"electrical_engineering",
"astronomy",
"anatomy",
"abstract_algebra",
"machine_learning",
"clinical_knowledge",
"global_facts",
"management",
"nutrition",
"marketing",
"professional_accounting",
"high_school_geography",
"international_law",
"moral_scenarios",
"computer_security",
"high_school_microeconomics",
"professional_law",
"medical_genetics",
"professional_psychology",
"jurisprudence",
"world_religions",
"philosophy",
"virology",
"high_school_chemistry",
"public_relations",
"high_school_macroeconomics",
"human_sexuality",
"elementary_mathematics",
"high_school_physics",
"high_school_computer_science",
"high_school_european_history",
"business_ethics",
"moral_disputes",
"high_school_statistics",
"miscellaneous",
"formal_logic",
"high_school_government_and_politics",
"prehistory",
"security_studies",
"high_school_biology",
"logical_fallacies",
"high_school_world_history",
"professional_medicine",
"high_school_mathematics",
"college_medicine",
"high_school_us_history",
"sociology",
"econometrics",
"high_school_psychology",
"human_aging",
"us_foreign_policy",
"conceptual_physics",
]
mmlu_datasets = []
for _name in mmlu_all_sets:
_hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
mmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
f"{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: "
),
dict(role="BOT", prompt="{target}\n")
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
round=[
dict(
role="HUMAN",
prompt=
f"{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: "
),
],
),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever),
inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
)
mmlu_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type="first-capital"))
mmlu_datasets.append(
dict(
abbr=f"lukaemon_mmlu_{_name}",
type=MMLUDataset,
path="./data/mmlu/",
name=_name,
reader_cfg=mmlu_reader_cfg,
infer_cfg=mmlu_infer_cfg,
eval_cfg=mmlu_eval_cfg,
))
del _name, _hint
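# MMLU (generation variant, completion-style prompts): same 5-shot setup as above, but the
# in-context examples and the query are rendered as plain strings rather than chat turns.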
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
# None of the MMLU datasets on HuggingFace are parsed correctly, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
mmlu_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
train_split='dev')
mmlu_all_sets = [
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"electrical_engineering",
"astronomy",
"anatomy",
"abstract_algebra",
"machine_learning",
"clinical_knowledge",
"global_facts",
"management",
"nutrition",
"marketing",
"professional_accounting",
"high_school_geography",
"international_law",
"moral_scenarios",
"computer_security",
"high_school_microeconomics",
"professional_law",
"medical_genetics",
"professional_psychology",
"jurisprudence",
"world_religions",
"philosophy",
"virology",
"high_school_chemistry",
"public_relations",
"high_school_macroeconomics",
"human_sexuality",
"elementary_mathematics",
"high_school_physics",
"high_school_computer_science",
"high_school_european_history",
"business_ethics",
"moral_disputes",
"high_school_statistics",
"miscellaneous",
"formal_logic",
"high_school_government_and_politics",
"prehistory",
"security_studies",
"high_school_biology",
"logical_fallacies",
"high_school_world_history",
"professional_medicine",
"high_school_mathematics",
"college_medicine",
"high_school_us_history",
"sociology",
"econometrics",
"high_school_psychology",
"human_aging",
"us_foreign_policy",
"conceptual_physics",
]
mmlu_datasets = []
for _name in mmlu_all_sets:
_hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
mmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=
"{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n",
),
prompt_template=dict(
type=PromptTemplate,
template=
f"{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:",
ice_token="</E>",
),
retriever=dict(type=FixKRetriever),
inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
)
mmlu_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type="first-capital"),
)
mmlu_datasets.append(
dict(
abbr=f"lukaemon_mmlu_{_name}",
type=MMLUDataset,
path="./data/mmlu/",
name=_name,
reader_cfg=mmlu_reader_cfg,
infer_cfg=mmlu_infer_cfg,
eval_cfg=mmlu_eval_cfg,
))
del _name, _hint
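# MMLU (generation variant, "Question/Answer" exemplars): the in-context examples are phrased as
# "Question: ... Answer: ..." while the final query keeps the "Q: ... A:" form.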
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
# None of the MMLU datasets on HuggingFace are parsed correctly, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
mmlu_reader_cfg = dict(
input_columns=["input", "A", "B", "C", "D"],
output_column="target",
train_split='dev')
mmlu_all_sets = [
"college_biology",
"college_chemistry",
"college_computer_science",
"college_mathematics",
"college_physics",
"electrical_engineering",
"astronomy",
"anatomy",
"abstract_algebra",
"machine_learning",
"clinical_knowledge",
"global_facts",
"management",
"nutrition",
"marketing",
"professional_accounting",
"high_school_geography",
"international_law",
"moral_scenarios",
"computer_security",
"high_school_microeconomics",
"professional_law",
"medical_genetics",
"professional_psychology",
"jurisprudence",
"world_religions",
"philosophy",
"virology",
"high_school_chemistry",
"public_relations",
"high_school_macroeconomics",
"human_sexuality",
"elementary_mathematics",
"high_school_physics",
"high_school_computer_science",
"high_school_european_history",
"business_ethics",
"moral_disputes",
"high_school_statistics",
"miscellaneous",
"formal_logic",
"high_school_government_and_politics",
"prehistory",
"security_studies",
"high_school_biology",
"logical_fallacies",
"high_school_world_history",
"professional_medicine",
"high_school_mathematics",
"college_medicine",
"high_school_us_history",
"sociology",
"econometrics",
"high_school_psychology",
"human_aging",
"us_foreign_policy",
"conceptual_physics",
]
mmlu_datasets = []
for _name in mmlu_all_sets:
_hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
mmlu_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
f"{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: "
),
dict(role="BOT", prompt="{target}\n")
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
round=[
dict(
role="HUMAN",
prompt=
f"{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: "
),
],
),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever),
inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]),
)
mmlu_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type="first-capital"))
mmlu_datasets.append(
dict(
abbr=f"lukaemon_mmlu_{_name}",
type=MMLUDataset,
path="./data/mmlu/",
name=_name,
reader_cfg=mmlu_reader_cfg,
infer_cfg=mmlu_infer_cfg,
eval_cfg=mmlu_eval_cfg,
))
del _name, _hint
from mmengine.config import read_base
with read_base():
from .nq_gen_c00b89 import nq_datasets # noqa: F401, F403
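# Natural Questions: zero-shot open-domain QA; NQEvaluator scores the BOT-role prediction against
# the reference answers.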
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NaturalQuestionDataset, NQEvaluator
nq_reader_cfg = dict(
input_columns=['question'], output_column='answer', train_split='test')
nq_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Answer these questions:\nQ: {question}?'),
dict(role='BOT', prompt='A:'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")
nq_datasets = [
dict(
type=NaturalQuestionDataset,
abbr='nq',
path='./data/nq/',
reader_cfg=nq_reader_cfg,
infer_cfg=nq_infer_cfg,
eval_cfg=nq_eval_cfg)
]
from mmengine.config import read_base
with read_base():
from .piqa_ppl_788dbe import piqa_datasets # noqa: F401, F403
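# QASPER (truncated-evidence variant): zero-shot QA over paper evidence passages within an
# 8192-token context, scored with TriviaQAEvaluator.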
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import QASPERCUTDataset, TriviaQAEvaluator
qaspercut_reader_cfg = dict(
input_columns=['question', 'evidence'],
output_column='answer',
train_split='dev',
test_split='dev')
qaspercut_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="{evidence}\nAnswer these questions:\nQ: {question}?\nA:"),
retriever=dict(type=ZeroRetriever),
inferencer=dict(
type=GenInferencer, max_out_len=50, max_seq_len=8192, batch_size=4))
qaspercut_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator))
qaspercut_datasets = [
dict(
type=QASPERCUTDataset,
abbr='qaspercut',
path='./data/QASPER/',
reader_cfg=qaspercut_reader_cfg,
infer_cfg=qaspercut_infer_cfg,
eval_cfg=qaspercut_eval_cfg)
]
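# Safety probe set: free-form generation on the prompts in ./data/safety.txt, scored by
# ToxicEvaluator; the 'idx' column is only a placeholder output (see the TODO below).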
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import ToxicEvaluator
from opencompass.datasets import SafetyDataset
safety_reader_cfg = dict(
input_columns=['prompt'],
output_column='idx',
train_split='test',
test_split='test')
# TODO: allow empty output-column
safety_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{prompt}'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
safety_eval_cfg = dict(evaluator=dict(type=ToxicEvaluator), )
safety_datasets = [
dict(
type=SafetyDataset,
path='./data/safety.txt',
reader_cfg=safety_reader_cfg,
infer_cfg=safety_infer_cfg,
eval_cfg=safety_eval_cfg)
]
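# SocialIQA (perplexity variant): PPLInferencer scores each of the three candidate answers and
# the lowest-perplexity option is taken as the prediction.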
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
siqa_reader_cfg = dict(
input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'],
output_column='label',
test_split='validation')
siqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
1: '{context} \nQ: {question}\nA: {answerA}',
2: '{context} \nQ: {question}\nA: {answerB}',
3: '{context} \nQ: {question}\nA: {answerC}',
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
siqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
siqa_datasets = [
dict(
abbr="siqa",
type=HFDataset,
path='social_i_qa',
name='social_i_qa',
reader_cfg=siqa_reader_cfg,
infer_cfg=siqa_infer_cfg,
eval_cfg=siqa_eval_cfg)
]
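# SocialIQA (perplexity variant with a "The following makes sense" preamble): otherwise the same
# setup as the config above, except for the prefix prepended to each candidate.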
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
siqa_reader_cfg = dict(
input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'],
output_column='label',
test_split='validation')
siqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
1: 'The following makes sense:\n {context} \nQ: {question}\nA: {answerA}',
2: 'The following makes sense:\n {context} \nQ: {question}\nA: {answerB}',
3: 'The following makes sense:\n {context} \nQ: {question}\nA: {answerC}',
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
siqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
siqa_datasets = [
dict(
abbr="siqa",
type=HFDataset,
path='social_i_qa',
reader_cfg=siqa_reader_cfg,
infer_cfg=siqa_infer_cfg,
eval_cfg=siqa_eval_cfg)
]
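# SummEdits: factual-consistency classification; the two PPL templates end with "No" and "Yes"
# respectively, and the lower-perplexity branch decides the predicted label.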
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
summedits_reader_cfg = dict(
input_columns=['doc', 'summary'],
output_column='label',
test_split='train')
summedits_prompt1 = "Given the document below, you have to determine if 'Yes' or 'No', the summary is factually consistent with the document."
summedits_prompt2 = "Document:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? "
summedits_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=summedits_prompt1)
],
round=[
dict(role="HUMAN", prompt=summedits_prompt2),
dict(role="BOT", prompt="No")
]),
1:
dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=summedits_prompt1)
],
round=[
dict(role="HUMAN", prompt=summedits_prompt2),
dict(role="BOT", prompt="Yes")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
summedits_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
summedits_datasets = [
dict(
type=HFDataset,
abbr='summedits',
path='json',
split='train',
data_files='./data/summedits/summedits.jsonl',
reader_cfg=summedits_reader_cfg,
infer_cfg=summedits_infer_cfg,
eval_cfg=summedits_eval_cfg)
]