Initial commit

be3dfa50 · jerrrrry · be3dfa50 · be3dfa50 · be3dfa50 · be3dfa50
Commit be3dfa50 authored Aug 06, 2025 by jerrrrry
20 changed files
--- a/opencompass/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py
+++ b/opencompass/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HellaswagDataset_V2
+
+hellaswag_reader_cfg = dict(
+    input_columns=['query', 'A', 'B', 'C', 'D'],
+    output_column='label')
+
+hellaswag_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            ans: dict(round=[
+                dict(role='HUMAN', prompt='{ctx}\nQuestion: Which ending makes the most sense?\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '),
+                dict(role='BOT', prompt=f'{ans}'),
+            ]) for ans in ['A', 'B', 'C', 'D']
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+hellaswag_datasets = [
+    dict(
+        abbr='hellaswag',
+        type=HellaswagDataset_V2,
+        path='opencompass/hellaswag',
+        reader_cfg=hellaswag_reader_cfg,
+        infer_cfg=hellaswag_infer_cfg,
+        eval_cfg=hellaswag_eval_cfg)
+]
--- a/opencompass/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py
+++ b/opencompass/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HellaswagDataset
+
+hellaswag_reader_cfg = dict(
+    input_columns=['ctx', 'A', 'B', 'C', 'D'],
+    output_column='label'
+)
+
+hellaswag_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0: '{ctx} {A}',
+            1: '{ctx} {B}',
+            2: '{ctx} {C}',
+            3: '{ctx} {D}',
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+hellaswag_datasets = [
+    dict(
+        abbr='hellaswag',
+        type=HellaswagDataset,
+        path='opencompass/hellaswag',
+        reader_cfg=hellaswag_reader_cfg,
+        infer_cfg=hellaswag_infer_cfg,
+        eval_cfg=hellaswag_eval_cfg)
+]
--- a/opencompass/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py
+++ b/opencompass/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HellaswagDataset_V3
+
+hellaswag_reader_cfg = dict(
+    input_columns=['query', 'A', 'B', 'C', 'D'],
+    output_column='gold')
+
+hellaswag_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            '0': dict(
+                round=[dict(role='HUMAN', prompt='{query} {A}')]
+            ),
+            '1': dict(
+                round=[dict(role='HUMAN', prompt='{query} {B}')]
+            ),
+            '2': dict(
+                round=[dict(role='HUMAN', prompt='{query} {C}')]
+            ),
+            '3': dict(
+                round=[dict(role='HUMAN', prompt='{query} {D}')]
+            ),
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+hellaswag_datasets = [
+    dict(
+        abbr='hellaswag',
+        type=HellaswagDataset_V3,
+        path='opencompass/hellaswag',
+        reader_cfg=hellaswag_reader_cfg,
+        infer_cfg=hellaswag_infer_cfg,
+        eval_cfg=hellaswag_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/README.md
+++ b/opencompass/configs/datasets/humaneval/README.md
+# HumanEval
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
+```
+
+## Base Models
+
+|          model           |   pass@1 |
+|:------------------------:|---------:|
+|    llama-7b-turbomind    |    12.80 |
+|   llama-13b-turbomind    |    15.24 |
+|   llama-30b-turbomind    |     9.15 |
+|   llama-65b-turbomind    |     7.32 |
+|   llama-2-7b-turbomind   |    14.02 |
+|  llama-2-13b-turbomind   |    15.24 |
+|  llama-2-70b-turbomind   |    15.24 |
+|   llama-3-8b-turbomind   |    28.05 |
+|  llama-3-70b-turbomind   |    28.05 |
+| internlm2-1.8b-turbomind |    30.49 |
+|  internlm2-7b-turbomind  |    48.17 |
+| internlm2-20b-turbomind  |    51.83 |
+|   qwen-1.8b-turbomind    |    16.46 |
+|    qwen-7b-turbomind     |    23.78 |
+|    qwen-14b-turbomind    |    23.78 |
+|    qwen-72b-turbomind    |    66.46 |
+|     qwen1.5-0.5b-hf      |     8.54 |
+|     qwen1.5-1.8b-hf      |    23.17 |
+|      qwen1.5-4b-hf       |    41.46 |
+|      qwen1.5-7b-hf       |    53.05 |
+|      qwen1.5-14b-hf      |    57.32 |
+|      qwen1.5-32b-hf      |    70.12 |
+|      qwen1.5-72b-hf      |    65.85 |
+|   qwen1.5-moe-a2-7b-hf   |    45.73 |
+|    mistral-7b-v0.1-hf    |    14.02 |
+|    mistral-7b-v0.2-hf    |     9.15 |
+|   mixtral-8x7b-v0.1-hf   |    24.39 |
+|  mixtral-8x22b-v0.1-hf   |    16.46 |
+|         yi-6b-hf         |    14.63 |
+|        yi-34b-hf         |    17.07 |
+|   deepseek-7b-base-hf    |    18.29 |
+|   deepseek-67b-base-hf   |    23.17 |
+
+## Chat Models
+
+|             model             |   pass@1 |
+|:-----------------------------:|---------:|
+|     qwen1.5-0.5b-chat-hf      |     9.15 |
+|     qwen1.5-1.8b-chat-hf      |    15.85 |
+|      qwen1.5-4b-chat-hf       |    30.49 |
+|      qwen1.5-7b-chat-hf       |    40.85 |
+|      qwen1.5-14b-chat-hf      |    50.00 |
+|      qwen1.5-32b-chat-hf      |    57.93 |
+|      qwen1.5-72b-chat-hf      |    60.37 |
+|     qwen1.5-110b-chat-hf      |    65.24 |
+|    internlm2-chat-1.8b-hf     |    33.54 |
+|  internlm2-chat-1.8b-sft-hf   |    34.15 |
+|     internlm2-chat-7b-hf      |    56.71 |
+|   internlm2-chat-7b-sft-hf    |    61.59 |
+|     internlm2-chat-20b-hf     |    67.68 |
+|   internlm2-chat-20b-sft-hf   |    67.68 |
+|    llama-3-8b-instruct-hf     |    55.49 |
+|    llama-3-70b-instruct-hf    |    70.73 |
+| llama-3-8b-instruct-lmdeploy  |    57.93 |
+| llama-3-70b-instruct-lmdeploy |    70.73 |
+|  mistral-7b-instruct-v0.1-hf  |    32.32 |
+|  mistral-7b-instruct-v0.2-hf  |    29.27 |
+| mixtral-8x7b-instruct-v0.1-hf |    34.15 |
--- a/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py
+++ b/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py
+++ b/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py
+++ b/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py
+++ b/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='Complete the following python code:\n{prompt}',
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py
+++ b/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{prompt}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py
+++ b/opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='Complete the following python code:'),
+            ],
+            round=[
+                dict(role='HUMAN', prompt='{prompt}'),
+            ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_gen.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_gen.py
+from mmengine.config import read_base
+
+with read_base():
+    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
--- a/opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
+
+HUMANEVAL_TEMPLATE = dict(
+    round=[
+        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
+    ]
+)
+
+humaneval_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024),
+)
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    k=[1, 10, 100],
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg,
+    )
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py
+# THIS SHALL ALSO BE DEPRECATED
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Complete the following python code:\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_dcae0e.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_dcae0e.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_o1_gen_5e7b00.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_o1_gen_5e7b00.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v3
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=8192))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v3),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval_o1_style',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Complete the following python code:\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval_passk',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py
+++ b/opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='Complete the following python code:\n{prompt}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess_v2),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval_repeat10',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        num_repeats=10,
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/internal_humaneval_gen_ce6b06.py
+++ b/opencompass/configs/datasets/humaneval/internal_humaneval_gen_ce6b06.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v2_postprocess
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='# Complete the following python code:\n{prompt}',
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_internal_v2_postprocess),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]
--- a/opencompass/configs/datasets/humaneval/internal_humaneval_gen_d2537e.py
+++ b/opencompass/configs/datasets/humaneval/internal_humaneval_gen_d2537e.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_internal_v1_postprocess
+
+humaneval_reader_cfg = dict(
+    input_columns=['prompt'], output_column='task_id', train_split='test')
+
+# TODO: allow empty output-column
+humaneval_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='Complete the following python code:\n{prompt}',
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+humaneval_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_internal_v1_postprocess),
+)
+
+humaneval_datasets = [
+    dict(
+        abbr='openai_humaneval',
+        type=HumanevalDataset,
+        path='opencompass/humaneval',
+        reader_cfg=humaneval_reader_cfg,
+        infer_cfg=humaneval_infer_cfg,
+        eval_cfg=humaneval_eval_cfg)
+]