"vscode:/vscode.git/clone" did not exist on "059711633041b73ac3ed3c3b287eee8667092f3f"
Commit 7d346000 authored by gaotongxiao's avatar gaotongxiao
Browse files

initial commit

parents
<div align="center">
<img src="https://user-images.githubusercontent.com/22607038/250798681-b52045d2-cedd-4070-84e2-410903ac404f.png" width="500px"/>
[![docs](https://readthedocs.org/projects/opencompass/badge/?version=dev-1.x)](https://opencompass.readthedocs.io/en/dev-1.x/?badge=dev-1.x)
[![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/InternLM/opencompass/blob/main/LICENSE)
[![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/)
[📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
[🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/install.html) |
[🤔Reporting Issues](https://github.com/InternLM/opencompass/issues/new/choose)
[English](/README.md) | 简体中文
</div>
## Introduction

OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmarking framework. Its main features are:

- **Comprehensive support for models and datasets**: out-of-the-box support for 20+ HuggingFace and API models, plus an evaluation scheme spanning 50+ datasets and roughly 300,000 questions, covering 6 capability dimensions.
- **Efficient distributed evaluation**: task partitioning and distributed evaluation in a single command; a full evaluation of a 100B-parameter model can finish within hours.\*
- **Diverse evaluation paradigms**: zero-shot, few-shot, and chain-of-thought evaluation, combined with standard or dialogue-style prompt templates, to easily elicit peak performance from all kinds of models.
- **Modular, easily extensible design**: want to add a new model or dataset, customize a more advanced task-partitioning strategy, or even plug in a new cluster management system? Everything in OpenCompass can be easily extended!
- **Complete experiment recording and reporting**: every experiment is fully captured in a config file, so key information stays traceable; results are reported to a Lark (Feishu) bot in real time, so you learn the outcome immediately.
## Leaderboard

## Capability Dimensions & Model Support

## Installation

Below are the steps for a quick installation. Some third-party features may require additional steps to work properly; see the [installation guide](https://opencompass.readthedocs.io/zh_cn/latest/get_started.html) for details.
```bash
conda create --name opencompass python=3.8 pytorch torchvision -c pytorch -y
conda activate opencompass
git clone https://github.com/InternLM/opencompass opencompass
cd opencompass
pip install -r requirements/runtime.txt
pip install -e .
# Download the datasets to data/
# TODO: ....
```
## Evaluation

Please read the [quick start](https://opencompass.readthedocs.io/zh_cn/latest/get_started.html) to learn how to run an evaluation task. A typical run is launched as shown below.
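A minimal sketch, assuming you have prepared a config file; the path `configs/eval_demo.py` is an illustrative placeholder, not a file shipped with this commit:

```bash
# Launch the evaluation described by a config file
python run.py configs/eval_demo.py
```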
## Acknowledgements

Some of the code in this project is borrowed and adapted from [OpenICL](https://github.com/Shark-NLP/OpenICL).

## Citation
```bibtex
@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished={\url{https://github.com/InternLM/OpenCompass}},
    year={2023}
}
```
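
# ARC-c entry config: re-exports ARC_c_datasets from the concrete PPL variant below.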
from mmengine.config import read_base
with read_base():
    from .ARC_c_ppl_ba951c import ARC_c_datasets  # noqa: F401, F403
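
# ARC_c_ppl_ba951c: ARC-Challenge scored by per-choice perplexity over HUMAN/BOT dialogue templates.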
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
ARC_c_reader_cfg = dict(
input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
output_column='answerKey')
ARC_c_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
"A":
dict(
round=[
dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
dict(role="BOT", prompt="{textA}")
], ),
"B":
dict(
round=[
dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
dict(role="BOT", prompt="{textB}")
], ),
"C":
dict(
round=[
dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
dict(role="BOT", prompt="{textC}")
], ),
"D":
dict(
round=[
dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
dict(role="BOT", prompt="{textD}")
], ),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
ARC_c_datasets = [
dict(
type=ARCDataset,
abbr='ARC-c',
path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl',
reader_cfg=ARC_c_reader_cfg,
infer_cfg=ARC_c_infer_cfg,
eval_cfg=ARC_c_eval_cfg)
]
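
# ARC-e entry config: re-exports the generation-based (gen) ARC-Easy datasets.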
from mmengine.config import read_base
with read_base():
    from .ARC_e_gen_0a29bf import ARC_e_datasets  # noqa: F401, F403
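
# C3 entry config: re-exports C3_datasets from the concrete PPL variant below.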
from mmengine.config import read_base
with read_base():
    from .CLUE_C3_ppl_588820 import C3_datasets  # noqa: F401, F403
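
# C3 (PPL): Chinese multiple-choice reading comprehension; one plain-string template per answer choice.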
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import C3Dataset
C3_reader_cfg = dict(
input_columns=[
'question', 'content', 'choice0', 'choice1', 'choice2', 'choice3',
'choices'
],
output_column='label')
C3_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0: "文章:{content}\n问题:{question}\n答案:{choice0}",
1: "文章:{content}\n问题:{question}\n答案:{choice1}",
2: "文章:{content}\n问题:{question}\n答案:{choice2}",
3: "文章:{content}\n问题:{question}\n答案:{choice3}"
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
C3_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
C3_datasets = [
dict(
type=C3Dataset,
abbr='C3',
path='./data/CLUE/C3/dev_0.json',
reader_cfg=C3_reader_cfg,
infer_cfg=C3_infer_cfg,
eval_cfg=C3_eval_cfg)
]
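
# DRCD (gen): Traditional-Chinese extractive QA with a plain-string prompt, scored by exact match (EM).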
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import DRCDDataset
DRCD_reader_cfg = dict(
input_columns=['question', 'context'], output_column='answers')
DRCD_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="文章:{context}\n根据上文,回答如下问题: {question}\n答:"),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
DRCD_eval_cfg = dict(evaluator=dict(type=EMEvaluator), )
DRCD_datasets = [
dict(
type=DRCDDataset,
abbr='DRCD_dev',
path='./data/CLUE/DRCD/dev.json',
reader_cfg=DRCD_reader_cfg,
infer_cfg=DRCD_infer_cfg,
eval_cfg=DRCD_eval_cfg),
]
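
# DRCD (gen): dialogue-template variant of the config above; predictions are read from the BOT role.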
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import DRCDDataset
DRCD_reader_cfg = dict(
input_columns=['question', 'context'], output_column='answers')
DRCD_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role="HUMAN", prompt="文章:{context}\n根据上文,回答如下问题:{question}"),
dict(role="BOT", prompt="答:"),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
DRCD_eval_cfg = dict(
evaluator=dict(type=EMEvaluator),
pred_role="BOT",
)
DRCD_datasets = [
dict(
type=DRCDDataset,
abbr='DRCD_dev',
path='./data/CLUE/DRCD/dev.json',
reader_cfg=DRCD_reader_cfg,
infer_cfg=DRCD_infer_cfg,
eval_cfg=DRCD_eval_cfg),
]
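
# CMNLI (gen): NLI phrased as a reading-comprehension question (对/错/可能); the answer is extracted as the first capital letter.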
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
cmnli_reader_cfg = dict(
input_columns=["sentence1", "sentence2"],
output_column="label",
test_split="train")
cmnli_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
"阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmnli_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type="first-capital"),
)
cmnli_datasets = [
dict(
abbr="cmnli",
type=cmnliDataset_V2,
path="./data/CLUE/cmnli/cmnli_public/dev.json",
reader_cfg=cmnli_reader_cfg,
infer_cfg=cmnli_infer_cfg,
eval_cfg=cmnli_eval_cfg,
)
]
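
# CMNLI (gen): sentence-pair relation phrasing (蕴含/矛盾/无关) of the same generation setup.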
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
cmnli_reader_cfg = dict(
input_columns=["sentence1", "sentence2"],
output_column="label",
test_split="train")
cmnli_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
"语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?\nA. 蕴含\nB. 矛盾\nC. 无关\n请从“A”,“B”,“C”中进行选择。\n答:"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmnli_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type="first-capital"),
)
cmnli_datasets = [
dict(
abbr="cmnli",
type=cmnliDataset_V2,
path="./data/CLUE/cmnli/cmnli_public/dev.json",
reader_cfg=cmnli_reader_cfg,
infer_cfg=cmnli_infer_cfg,
eval_cfg=cmnli_eval_cfg,
)
]
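
# CMNLI (PPL): one dialogue template per label; the dataset is loaded via HFDataset from a local JSON file.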
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
cmnli_reader_cfg = dict(
input_columns=['sentence1', 'sentence2'],
output_column='label',
test_split='train')
cmnli_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'contradiction':
dict(round=[
dict(
role="HUMAN",
prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
),
dict(role="BOT", prompt="矛盾")
]),
'entailment':
dict(round=[
dict(
role="HUMAN",
prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
),
dict(role="BOT", prompt="蕴含")
]),
'neutral':
dict(round=[
dict(
role="HUMAN",
prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
),
dict(role="BOT", prompt="无关")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
cmnli_datasets = [
dict(
type=HFDataset,
abbr='cmnli',
path='json',
split='train',
data_files='./data/CLUE/cmnli/cmnli_public/dev.json',
reader_cfg=cmnli_reader_cfg,
infer_cfg=cmnli_infer_cfg,
eval_cfg=cmnli_eval_cfg)
]
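
# OCNLI (gen): reuses the CMNLI loader (same data format) with an A/B/C multiple-choice prompt.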
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
ocnli_reader_cfg = dict(
input_columns=["sentence1", "sentence2"],
output_column="label",
)
# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
"语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?\nA. 蕴含\n B. 矛盾\n C. 无关\n请从“A”,“B”,“C”中进行选择。\n答:"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ocnli_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type="first-capital"),
)
ocnli_datasets = [
dict(
abbr="ocnli",
type=cmnliDataset_V2, # ocnli share the same format with cmnli
path="./data/CLUE/OCNLI/dev.json",
reader_cfg=ocnli_reader_cfg,
infer_cfg=ocnli_infer_cfg,
eval_cfg=ocnli_eval_cfg,
)
]
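
# OCNLI (PPL): per-label dialogue templates phrased as true/false/maybe judgments (对/错/可能).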
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
ocnli_reader_cfg = dict(
input_columns=['sentence1', 'sentence2'], output_column='label')
# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'contradiction':
dict(round=[
dict(
role="HUMAN",
prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
dict(role="BOT", prompt="错")
]),
'entailment':
dict(round=[
dict(
role="HUMAN",
prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
dict(role="BOT", prompt="对")
]),
'neutral':
dict(round=[
dict(
role="HUMAN", prompt="如果{sentence1}为真,那么{sentence2}也为真吗?"),
dict(role="BOT", prompt="可能")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ocnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
ocnli_datasets = [
dict(
type=HFDataset,
abbr='ocnli',
path='json',
split='train',
data_files='./data/CLUE/OCNLI/dev.json',
reader_cfg=ocnli_reader_cfg,
infer_cfg=ocnli_infer_cfg,
eval_cfg=ocnli_eval_cfg)
]
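
# OCNLI (PPL): per-label dialogue templates phrased as sentence-pair relations (矛盾/蕴含/无关).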
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
ocnli_reader_cfg = dict(
input_columns=['sentence1', 'sentence2'], output_column='label')
# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
'contradiction':
dict(round=[
dict(
role="HUMAN",
prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
),
dict(role="BOT", prompt="矛盾")
]),
'entailment':
dict(round=[
dict(
role="HUMAN",
prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
),
dict(role="BOT", prompt="蕴含")
]),
'neutral':
dict(round=[
dict(
role="HUMAN",
prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
),
dict(role="BOT", prompt="无关")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
ocnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
ocnli_datasets = [
dict(
type=HFDataset,
abbr='ocnli',
path='json',
split='train',
data_files='./data/CLUE/OCNLI/dev.json',
reader_cfg=ocnli_reader_cfg,
infer_cfg=ocnli_infer_cfg,
eval_cfg=ocnli_eval_cfg)
]
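
# BUSTM (PPL): sentence-pair similarity; a SYSTEM instruction (falling back to HUMAN) precedes each dialogue.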
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
bustm_reader_cfg = dict(
input_columns=['sentence1', 'sentence2'],
output_column='label',
test_split='train')
bustm_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="请判断以下两句话说的是否是一个意思:")
],
round=[
dict(role="HUMAN", prompt="{sentence1},{sentence2}"),
dict(role="BOT", prompt="两句话说的毫不相关。")
]),
1:
dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="请判断以下两句话说的是否是一个意思:")
],
round=[
dict(role="HUMAN", prompt="{sentence1},{sentence2}"),
dict(role="BOT", prompt="两句话说是的一个意思。")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
bustm_datasets = [
dict(
type=HFDataset,
abbr='bustm-dev',
path='json',
data_files='./data/FewCLUE/bustm/dev_few_all.json',
split='train',
reader_cfg=bustm_reader_cfg,
infer_cfg=bustm_infer_cfg,
eval_cfg=bustm_eval_cfg),
dict(
type=HFDataset,
abbr='bustm-test',
path='json',
data_files='./data/FewCLUE/bustm/test_public.json',
split='train',
reader_cfg=bustm_reader_cfg,
infer_cfg=bustm_infer_cfg,
eval_cfg=bustm_eval_cfg)
]
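
# BUSTM (PPL): variant with the instruction folded into the HUMAN turn instead of a SYSTEM prompt.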
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset
bustm_reader_cfg = dict(
input_columns=['sentence1', 'sentence2'],
output_column='label',
test_split='train')
bustm_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
dict(round=[
dict(
role="HUMAN",
prompt=
"语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?"
),
dict(role="BOT", prompt="两句话说的毫不相关。")
]),
1:
dict(round=[
dict(
role="HUMAN",
prompt=
"语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?"
),
dict(role="BOT", prompt="两句话说是的一个意思。")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
bustm_datasets = [
dict(
type=HFDataset,
abbr='bustm-dev',
path='json',
data_files='./data/FewCLUE/bustm/dev_few_all.json',
split='train',
reader_cfg=bustm_reader_cfg,
infer_cfg=bustm_infer_cfg,
eval_cfg=bustm_eval_cfg),
dict(
type=HFDataset,
abbr='bustm-test',
path='json',
data_files='./data/FewCLUE/bustm/test_public.json',
split='train',
reader_cfg=bustm_reader_cfg,
infer_cfg=bustm_infer_cfg,
eval_cfg=bustm_eval_cfg)
]
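
# CHID (gen): idiom cloze; the model picks which of options A-G fills the blank.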
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset_V2
chid_reader_cfg = dict(
input_columns=["content","A","B","C","D","E","F","G"],
output_column="answer",
)
chid_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=
"{content}\n请选择______处所填的词\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nF. {F}\nG. {G}\n请从”A“,”B“,”C“,”D“,”E“,”F“,”G“中进行选择。答:",
),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
chid_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type="first-capital"),
)
chid_datasets = [
dict(
abbr="chid-dev",
type=CHIDDataset_V2,
path="./data/FewCLUE/chid/dev_few_all.json",
reader_cfg=chid_reader_cfg,
infer_cfg=chid_infer_cfg,
eval_cfg=chid_eval_cfg,
),
dict(
abbr="chid-test",
type=CHIDDataset_V2,
path="./data/FewCLUE/chid/test_public.json",
reader_cfg=chid_reader_cfg,
infer_cfg=chid_infer_cfg,
eval_cfg=chid_eval_cfg,
),
]
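
# CLUEWSC (PPL): Winograd-style coreference resolved as a yes/no (是/否) dialogue judgment.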
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CluewscDataset
cluewsc_reader_cfg = dict(
input_columns=['span1', 'span2', 'text', 'new_text'],
output_column='answer')
cluewsc_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
dict(round=[
dict(
role="HUMAN",
prompt="{text}\n此处,代词“{span2}“被用于指代“{span1}“吗?"),
dict(role="BOT", prompt="否")
]),
1:
dict(round=[
dict(
role="HUMAN",
prompt="{text}\n此处,代词“{span2}“被用于指代“{span1}“吗?"),
dict(role="BOT", prompt="是")
]),
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
cluewsc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
cluewsc_datasets = [
dict(
type=CluewscDataset,
path='json',
abbr='cluewsc-dev',
data_files='./data/FewCLUE/cluewsc/dev_few_all.json',
split='train',
reader_cfg=cluewsc_reader_cfg,
infer_cfg=cluewsc_infer_cfg,
eval_cfg=cluewsc_eval_cfg),
dict(
type=CluewscDataset,
path='json',
abbr='cluewsc-test',
data_files='./data/FewCLUE/cluewsc/test_public.json',
split='train',
reader_cfg=cluewsc_reader_cfg,
infer_cfg=cluewsc_infer_cfg,
eval_cfg=cluewsc_eval_cfg),
]
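
# CLUEWSC (PPL): plain-string, English-prompt variant of the coreference config above.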
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CluewscDataset
cluewsc_reader_cfg = dict(
input_columns=['span1', 'span2', 'text', 'new_text'],
output_column='answer')
cluewsc_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
"{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"? No.",
1:
"{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"? Yes.",
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
cluewsc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
cluewsc_datasets = [
dict(
type=CluewscDataset,
path='json',
abbr='cluewsc-dev',
data_files='./data/FewCLUE/cluewsc/dev_few_all.json',
split='train',
reader_cfg=cluewsc_reader_cfg,
infer_cfg=cluewsc_infer_cfg,
eval_cfg=cluewsc_eval_cfg),
dict(
type=CluewscDataset,
path='json',
abbr='cluewsc-test',
data_files='./data/FewCLUE/cluewsc/test_public.json',
split='train',
reader_cfg=cluewsc_reader_cfg,
infer_cfg=cluewsc_infer_cfg,
eval_cfg=cluewsc_eval_cfg),
]
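
# CSL entry config: re-exports csl_datasets from the concrete PPL variant.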
from mmengine.config import read_base
with read_base():
    from .FewCLUE_csl_ppl_8eee08 import csl_datasets  # noqa: F401, F403