"ts/nni_manager/common/trainingService.ts" did not exist on "3e62e60b0376e54a33b7f8a2aed90560a61f7fe7"
Commit c289ecc0 authored by xinghao's avatar xinghao
Browse files

Initial commit

parents
Pipeline #3004 canceled with stages
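# CommonsenseQA (English): 8-shot in-context examples chosen by MDLRetriever,
# PPL scoring; each candidate answer template fills the answer slot with the option text.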
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import MDLRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation')
_ice_template = dict(
type=PromptTemplate,
template={
ans: dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt=ans_token),
])
for ans, ans_token in [['A', '{A}'], ['B', '{B}'],
['C', '{C}'], ['D', '{D}'],
['E', '{E}']]
},
ice_token='</E>')
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(
type=MDLRetriever,
ice_num=8,
candidate_num=30,
select_time=10,
seed=1,
batch_size=12,
ice_template=_ice_template),
inferencer=dict(type=PPLInferencer))
commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg)
]
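
# CommonsenseQA (English): same MDLRetriever + PPL setup as above, but with
# plain-string per-option templates instead of dialogue-role templates.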
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import MDLRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation')
_ice_template = dict(
type=PromptTemplate,
template={
'A': '</E>Answer the following question:\n{question}\nAnswer: {A}',
'B': '</E>Answer the following question:\n{question}\nAnswer: {B}',
'C': '</E>Answer the following question:\n{question}\nAnswer: {C}',
'D': '</E>Answer the following question:\n{question}\nAnswer: {D}',
'E': '</E>Answer the following question:\n{question}\nAnswer: {E}',
},
ice_token='</E>')
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(
type=MDLRetriever,
ice_num=8,
candidate_num=30,
select_time=10,
seed=1,
batch_size=12,
ice_template=_ice_template),
inferencer=dict(type=PPLInferencer))
commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg)
]
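
# CommonsenseQA (English): fixed 8-shot examples via FixKRetriever, PPL scoring;
# the prompt lists all options and the candidate answer is the option letter.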
# Use FixKRetriever to avoid the hang caused by Huggingface
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation')
_ice_template = dict(
type=PromptTemplate,
template={
ans: dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nAnswer: '),
dict(role='BOT', prompt=f'{ans}'),
])
for ans in ['A', 'B', 'C', 'D', 'E']
},
ice_token='</E>')
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4, 5, 6, 7]),
inferencer=dict(type=PPLInferencer))
commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg)
]
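
# CommonsenseQA (English): fixed 8-shot examples via FixKRetriever, PPL scoring;
# question-only prompt with the option text as the candidate answer.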
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
commonsenseqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation')
_ice_template = dict(
type=PromptTemplate,
template={
ans: dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
dict(role='BOT', prompt=ans_token),
])
for ans, ans_token in [['A', '{A}'], ['B', '{B}'],
['C', '{C}'], ['D', '{D}'],
['E', '{E}']]
},
ice_token='</E>')
commonsenseqa_infer_cfg = dict(
ice_template=_ice_template,
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4, 5, 6, 7]),
inferencer=dict(type=PPLInferencer))
commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
commonsenseqa_datasets = [
dict(
abbr='commonsense_qa',
type=commonsenseqaDataset,
path='opencompass/commonsense_qa',
reader_cfg=commonsenseqa_reader_cfg,
infer_cfg=commonsenseqa_infer_cfg,
eval_cfg=commonsenseqa_eval_cfg)
]
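
# Aggregator config: re-exports the CommonsenseQA-CN generation datasets via read_base().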
from mmengine.config import read_base
with read_base():
from .commonsenseqacn_gen_d380d0 import commonsenseqacn_datasets # noqa: F401, F403
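
# CommonsenseQA-CN: zero-shot generation with GenInferencer; accuracy is computed
# after extracting the first capital letter from the prediction.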
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CommonsenseQADataset_CN
from opencompass.utils.text_postprocessors import first_capital_postprocess
commonsenseqacn_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation',
)
_ice_template = dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt='{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\n答案:',
),
dict(role='BOT', prompt='{answerKey}'),
],
),
ice_token='</E>',
)
commonsenseqacn_infer_cfg = dict(
prompt_template=_ice_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
commonsenseqacn_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess),
)
commonsenseqacn_datasets = [
dict(
abbr='commonsenseqa_cn',
type=CommonsenseQADataset_CN,
path='./data/commonsenseqa_cn/validation.jsonl',
reader_cfg=commonsenseqacn_reader_cfg,
infer_cfg=commonsenseqacn_infer_cfg,
eval_cfg=commonsenseqacn_eval_cfg,
)
]
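
# Aggregator config: re-exports the CommonsenseQA-CN PPL datasets via read_base().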
from mmengine.config import read_base
with read_base():
from .commonsenseqacn_ppl_971f48 import commonsenseqacn_datasets # noqa: F401, F403
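
# CommonsenseQA-CN: zero-shot PPL scoring over per-option answer templates.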
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CommonsenseQADataset_CN
commonsenseqacn_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
output_column='answerKey',
test_split='validation',
)
_ice_template = dict(
type=PromptTemplate,
template={
ans: dict(
begin='</E>',
round=[
dict(role='HUMAN', prompt='问题: {question}\n答案: '),
dict(role='BOT', prompt=ans_token),
],
)
for ans, ans_token in [
['A', '{A}'],
['B', '{B}'],
['C', '{C}'],
['D', '{D}'],
['E', '{E}'],
]
},
ice_token='</E>',
)
commonsenseqacn_infer_cfg = dict(
prompt_template=_ice_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
commonsenseqacn_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
commonsenseqacn_datasets = [
dict(
abbr='commonsenseqa_cn',
type=CommonsenseQADataset_CN,
path='./data/commonsenseqa_cn/validation.jsonl',
reader_cfg=commonsenseqacn_reader_cfg,
infer_cfg=commonsenseqacn_infer_cfg,
eval_cfg=commonsenseqacn_eval_cfg,
)
]
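
# CIBench (code-interpreter benchmark): agent-based inference (AgentInferencer)
# over English and Chinese template tasks for several Python libraries.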
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
libs = [
'/lightgbm',
'/matplotlib',
'/nltk',
'/opencv',
'/pandas',
'/pytorch',
'/scipy',
'/seaborn',
'/sklearn',
'/tensorflow',
'_chinese/lightgbm',
'_chinese/matplotlib',
'_chinese/nltk',
'_chinese/opencv',
'_chinese/pandas',
'_chinese/pytorch',
'_chinese/scipy',
'_chinese/seaborn',
'_chinese/sklearn',
'_chinese/tensorflow',
]
cibench_datasets = []
for lib in libs:
cibench_reader_cfg = dict(
input_columns=['questions'], output_column='references', train_split='test', test_split='test'
)
cibench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{questions}',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role='BOT')
cibench_datasets.append(
dict(
abbr=f'cibench_template{lib}',
type=CIBenchDataset,
path=f'data/compassbench_v1.1/agent-cibench/cibench_template{lib}',
internet_check=False,
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
eval_cfg=cibench_eval_cfg,
)
)
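
# T-Eval plugin evaluation: chat inference over tool-use subsets (English and
# Chinese variants), scored with TEvalEvaluator per subset.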
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer
from opencompass.openicl.icl_evaluator import TEvalEvaluator
from opencompass.datasets import teval_postprocess, TEvalDataset
plugin_eval_subject_mapping = {
'instruct': ['instruct_v1'],
'instruct_zh': ['instruct_v1_zh'],
'plan': ['plan_json_v1', 'plan_str_v1'],
'plan_zh': ['plan_json_v1_zh', 'plan_str_v1_zh'],
'review': ['review_str_v1'],
'review_zh': ['review_str_v1_zh'],
'reason_retrieve_understand': ['reason_retrieve_understand_json_v1'],
'reason_retrieve_understand_zh': ['reason_retrieve_understand_json_v1_zh'],
'reason': ['reason_str_v1'],
'reason_zh': ['reason_str_v1_zh'],
'retrieve': ['retrieve_str_v1'],
'retrieve_zh': ['retrieve_str_v1_zh'],
'understand': ['understand_str_v1'],
'understand_zh': ['understand_str_v1_zh'],
}
plugin_eval_datasets = []
for _name in plugin_eval_subject_mapping:
plugin_eval_reader_cfg = dict(input_columns=['prompt'], output_column='ground_truth')
plugin_eval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer),
)
plugin_eval_eval_cfg = dict(
evaluator=dict(type=TEvalEvaluator, subset=_name),
pred_postprocessor=dict(type=teval_postprocess),
num_gpus=1,
)
for subset in plugin_eval_subject_mapping[_name]:
plugin_eval_datasets.append(
dict(
abbr='plugin_eval-mus-p10-' + subset,
type=TEvalDataset,
path='data/compassbench_v1.1/agent-teval-p10',
name=subset,
reader_cfg=plugin_eval_reader_cfg,
infer_cfg=plugin_eval_infer_cfg,
eval_cfg=plugin_eval_eval_cfg,
)
)
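
# Usage note (a minimal sketch, not part of the original configs): each block
# above exports a `*_datasets` list, and a top-level OpenCompass run config can
# assemble those lists through the same read_base() idiom used by the aggregator
# configs above. Module names other than `commonsenseqacn_ppl_971f48` are
# hypothetical and depend on how the config files are actually named.
from mmengine.config import read_base

with read_base():
    from .commonsenseqacn_ppl_971f48 import commonsenseqacn_datasets  # noqa: F401, F403
    # from .commonsenseqa_ppl_example import commonsenseqa_datasets  # hypothetical filename

# The combined `datasets` list (together with a `models` list) is what the
# top-level config typically exposes to the OpenCompass runner.
datasets = [*commonsenseqacn_datasets]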