Commit c94cc943 authored by Leymore, committed by gaotong

Add release contribution

parent e6b5bdcb
from mmengine.config import read_base
with read_base():
    from .triviaqa_gen_cc3cbf import triviaqa_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset, TriviaQAEvaluator
triviaqa_reader_cfg = dict(
input_columns=['question'],
output_column='answer',
train_split='dev',
test_split='dev')
triviaqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=50))
triviaqa_eval_cfg = dict(
evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT')
triviaqa_datasets = [
dict(
type=TriviaQADataset,
abbr='triviaqa',
path='./data/triviaqa/',
reader_cfg=triviaqa_reader_cfg,
infer_cfg=triviaqa_infer_cfg,
eval_cfg=triviaqa_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQArcDataset, TriviaQAEvaluator
triviaqarc_reader_cfg = dict(
input_columns=['question', 'evidence'],
output_column='answer',
train_split='dev',
test_split='dev')
triviaqarc_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{evidence}\nAnswer these questions:\nQ: {question}?A:'),
dict(role='BOT', prompt=''),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(
type=GenInferencer, max_out_len=50, max_seq_len=8192, batch_size=4))
triviaqarc_eval_cfg = dict(
evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT')
triviaqarc_datasets = [
dict(
type=TriviaQArcDataset,
abbr='triviaqarc',
path='./data/triviaqa-rc/',
reader_cfg=triviaqarc_reader_cfg,
infer_cfg=triviaqarc_infer_cfg,
eval_cfg=triviaqarc_eval_cfg)
]
from mmengine.config import read_base
with read_base():
    from .truthfulqa_gen_0a3a53 import truthfulqa_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TruthfulQADataset, TruthfulQAEvaluator
truthfulqa_reader_cfg = dict(
input_columns=['question'],
output_column='reference',
train_split='validation',
test_split='validation')
# TODO: allow empty output-column
truthfulqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role="HUMAN", prompt="{question}")])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
# Metrics such as 'truth' and 'info' need an OPENAI_API_KEY
# that has access to your own fine-tuned models.
# Please use your own fine-tuned OpenAI models with keys and refer to
# the source code for more details.
# Metrics such as 'bleurt', 'rouge', 'bleu' are free to test
# When key is set to "ENV", the key will be fetched from the environment
# variable $OPENAI_API_KEY. Otherwise, set key in here directly.
truthfulqa_eval_cfg = dict(
evaluator=dict(
type=TruthfulQAEvaluator, metrics=('truth', 'info'), key='ENV'), )
truthfulqa_datasets = [
dict(
type=TruthfulQADataset,
path='truthful_qa',
name='generation',
reader_cfg=truthfulqa_reader_cfg,
infer_cfg=truthfulqa_infer_cfg,
eval_cfg=truthfulqa_eval_cfg)
]
from mmengine.config import read_base
with read_base():
    from .z_bench_gen_5813ec import z_bench_dataset # noqa: F401, F403
agieval_summary_groups = []
_agieval_chinese_sets = ['gaokao-chinese', 'gaokao-english', 'gaokao-geography', 'gaokao-history', 'gaokao-biology', 'gaokao-chemistry', 'gaokao-physics', 'gaokao-mathqa', 'logiqa-zh', 'jec-qa-kd', 'jec-qa-ca', 'gaokao-mathcloze']
_agieval_chinese_sets = ['agieval-' + s for s in _agieval_chinese_sets]
agieval_summary_groups.append({'name': 'agieval-chinese', 'subsets': _agieval_chinese_sets})
_agieval_english_sets = ['lsat-ar', 'lsat-lr', 'lsat-rc', 'logiqa-en', 'sat-math', 'sat-en', 'sat-en-without-passage', 'aqua-rat', 'math']
_agieval_english_sets = ['agieval-' + s for s in _agieval_english_sets]
agieval_summary_groups.append({'name': 'agieval-english', 'subsets': _agieval_english_sets})
_agieval_gaokao_sets = ['gaokao-chinese', 'gaokao-english', 'gaokao-geography', 'gaokao-history', 'gaokao-biology', 'gaokao-chemistry', 'gaokao-physics', 'gaokao-mathqa', 'gaokao-mathcloze']
_agieval_gaokao_sets = ['agieval-' + s for s in _agieval_gaokao_sets]
agieval_summary_groups.append({'name': 'agieval-gaokao', 'subsets': _agieval_gaokao_sets})
_agieval_all = ['gaokao-chinese', 'gaokao-english', 'gaokao-geography', 'gaokao-history', 'gaokao-biology', 'gaokao-chemistry', 'gaokao-physics', 'gaokao-mathqa', 'logiqa-zh', 'lsat-ar', 'lsat-lr', 'lsat-rc', 'logiqa-en', 'sat-math', 'sat-en', 'sat-en-without-passage', 'aqua-rat', 'jec-qa-kd', 'jec-qa-ca', 'gaokao-mathcloze', 'math']
_agieval_all = ['agieval-' + s for s in _agieval_all]
agieval_summary_groups.append({'name': 'agieval', 'subsets': _agieval_all})
from mmengine.config import read_base
with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
summarizer = dict(
dataset_abbrs = [
'--- Exam ---',
'agieval',
'mmlu-all-set',
"ceval",
"GaokaoBench",
"bbh",
'--- Coding ---',
'openai_humaneval',
'mbpp',
'--- ChineseUniversal ---',
'C3',
'CMRC_dev',
'DRCD_dev',
'afqmc-dev',
'cmnli',
'ocnli',
'bustm-dev',
'chid-dev',
'cluewsc-dev',
'csl_dev',
'eprstmt-dev',
'ocnli_fc-dev',
'tnews-dev',
'lcsts',
'--- Completion ---',
'lambada',
'story_cloze',
'--- EnglishUniversal ---',
'AX_b',
'AX_g',
'BoolQ',
'CB',
'COPA',
'MultiRC',
'RTE',
'ReCoRD',
'WiC',
'WSC',
'race-high',
'race-middle',
'--- NLG ---',
'Xsum',
'--- Reasoning ---',
'gsm8k',
'summedits',
'math',
'TheoremQA',
'--- QA ---',
'hellaswag',
'ARC-e',
'ARC-c',
'commonsense_qa',
'piqa',
'siqa',
'strategyqa',
'winogrande',
'openbookqa',
'openbookqa_fact',
'nq',
'triviaqa',
'--- Translation ---',
'flores_100_Indo-European-Germanic_English',
'flores_100_English_Indo-European-Germanic',
'flores_100_Indo-European-Romance_English',
'flores_100_English_Indo-European-Romance',
'flores_100_zho_simpl-eng',
'flores_100_eng-zho_simpl',
'--- Security ---',
'crows_pairs',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
prompt_db=dict(
database_path='configs/datasets/log.json',
config_dir='configs/datasets',
blacklist='.promptignore'),
)
from mmengine.config import read_base
with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
summarizer = dict(
dataset_abbrs = [
'--- Exam ---',
'mmlu-all-set',
"ceval",
"bbh",
'--- ChineseUniversal ---',
'CMRC_dev',
'DRCD_dev',
'afqmc-dev',
'bustm-dev',
'chid-dev',
'cluewsc-dev',
'eprstmt-dev',
'--- Coding ---',
'openai_humaneval',
'mbpp',
'--- Completion ---',
'lambada',
'story_cloze',
'--- EnglishUniversal ---',
'AX_b',
'AX_g',
'BoolQ',
'CB',
'COPA',
'MultiRC',
'RTE',
'ReCoRD',
'WiC',
'WSC',
'race-high',
'race-middle',
'--- Reasoning ---',
'math',
'gsm8k',
'summedits',
'--- QA ---',
'hellaswag',
'piqa',
'winogrande',
'openbookqa',
'openbookqa_fact',
'nq',
'triviaqa',
'--- Security ---',
'crows_pairs',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
prompt_db=dict(
database_path='configs/datasets/log.json',
config_dir='configs/datasets',
blacklist='.promptignore'),
)
.header-logo {
background-image: url("../image/logo.png");
background-size: 183px 50px;
height: 50px;
width: 183px;
}
@media screen and (min-width: 1100px) {
.header-logo {
top: -12px;
}
}
pre {
white-space: pre;
}
@media screen and (min-width: 2000px) {
.pytorch-content-left {
width: 1200px;
margin-left: 30px;
}
article.pytorch-article {
max-width: 1200px;
}
.pytorch-breadcrumbs-wrapper {
width: 1200px;
}
.pytorch-right-menu.scrolling-fixed {
position: fixed;
top: 45px;
left: 1580px;
}
}
article.pytorch-article section code {
padding: .2em .4em;
background-color: #f3f4f7;
border-radius: 5px;
}
/* Disable the change in tables */
article.pytorch-article section table code {
padding: unset;
background-color: unset;
border-radius: unset;
}
table.autosummary td {
width: 50%
}
img.align-center {
display: block;
margin-left: auto;
margin-right: auto;
}
article.pytorch-article p.rubric {
font-weight: bold;
}
# Task Execution and Monitoring
## Launching an Evaluation Task
The entry point for evaluation tasks is `run.py`. Its usage is as follows:
```shell
run.py [-p PARTITION] [-q QUOTATYPE] [--debug] [-m MODE] [-r [REUSE]] [-w WORKDIR] [-l LARK] config
```
The parameters are explained as follows:
- `-p`: specify the Slurm partition;
- `-q`: specify the Slurm quotatype (default `auto`); valid values are `reserved`, `auto`, and `spot`;
- `--debug`: when enabled, inference and evaluation tasks run in single-process mode and output is echoed in real time for debugging;
- `-m`: run mode, default `all`. Set it to `infer` to run only inference and obtain outputs; if model outputs already exist in `{WORKDIR}`, set it to `eval` to run only evaluation and obtain metrics; if individual evaluation results already exist in `results`, set it to `viz` to run only visualization; `all` runs both inference and evaluation;
- `-r`: reuse existing inference results. If followed by a timestamp, the results under that timestamp in the work directory are reused; otherwise, the latest results in the specified work directory are reused;
- `-w`: specify the work directory, default `./outputs/default`;
- `-l`: enable status reporting via a Lark bot.
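For example, `python run.py configs/eval_demo.py -p {PARTITION} -m infer -r` reruns only the inference stage while reusing the latest existing results under the default work directory `./outputs/default`.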
Using run mode `-m all` as an example, the overall execution flow is as follows:
1. Read the configuration file and parse out the model, dataset, evaluator, and other configuration information.
2. The evaluation task consists of three stages: inference (`infer`), evaluation (`eval`), and visualization (`viz`). After the Partitioner divides them into sub-tasks, they are handed over to the Runner for parallel execution. Individual inference and evaluation tasks are abstracted into `OpenICLInferTask` and `OpenICLEvalTask` respectively.
3. After each stage ends, the visualization stage reads the evaluation results in `results` to generate a visualization report.
## Task Monitoring: Lark Bot
Users can enable real-time monitoring of task status by setting up a Lark bot. Please refer to [this document](https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN?lang=zh-CN#7a28964d) for setting up the Lark bot.
Configuration method:
1. Open the `configs/lark.py` file, and add the following line:
```python
lark_bot_url = 'YOUR_WEBHOOK_URL'
```
Typically, the Webhook URL is formatted like this: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxxxx .
2. Inherit this file in the complete evaluation configuration:
```python
from mmengine.config import read_base
with read_base():
    from .lark import lark_bot_url
```
3. To avoid frequent messages from the bot becoming a nuisance, status updates are not automatically reported by default. You can start status reporting using `-l` or `--lark` when needed:
```bash
python run.py configs/eval_demo.py -p {PARTITION} -l
```
## Introduction to the Summarizer
The summarizer is mainly used to visualize evaluation results; a minimal sketch of its config follows.
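The sketch below shows only the two core fields, with dataset abbreviations taken from the summarizer configs included in this commit; the full configs in this commit additionally set `prompt_db`:
```python
summarizer = dict(
    # Dataset (or summary-group) abbreviations to display, in order.
    dataset_abbrs=['mmlu-all-set', 'ceval', 'gsm8k'],
    # Merge every imported *_summary_groups list into one flat list.
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
```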
## Run Results
All run results are placed in the `outputs/default/` directory by default; the directory structure is shown below:
```
outputs/default/
├── 20200220_120000
├── ...
├── 20230220_183030
│ ├── configs
│ ├── logs
│ │ ├── eval
│ │ └── infer
│ ├── predictions
│ │ └── MODEL1
│ └── results
│ └── MODEL1
```
Each timestamped folder contains the following content:
- `configs`: stores the configuration files of every run that used this timestamp as its output directory;
- `logs`: stores the output log files of the inference and evaluation stages; each folder stores its logs in per-model subfolders;
- `predictions`: stores the inference results in JSON, with one subfolder per model;
- `results`: stores the evaluation results in JSON, with one subfolder per model.
In addition, whenever `-r` is used without a timestamp, the newest folder (sorted by name) is selected as the output directory.
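For example, `python run.py configs/eval_demo.py -r 20230220_183030` reuses the results under `outputs/default/20230220_183030/`, while `python run.py configs/eval_demo.py -r` picks the newest timestamped folder automatically.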
var collapsedSections = ['Advanced Guides', 'Tools', 'User Guides', 'Notes'];
$(document).ready(function () {
$('.model-summary').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
});
});
Welcome to the OpenCompass Chinese Tutorial!
============================================

Getting Started with OpenCompass
--------------------------------

To help users get up to speed quickly, we recommend the following route:

- If you want to use OpenCompass, we recommend reading the `Get Started`_ section first to set up the environment.
- For basic usage, we suggest reading the `User Guides`_.
- If you want to customize the algorithms, we provide the `Advanced Guides`_.
- If you want to tune the prompts, you can browse the `Prompts`_ section.
- We also provide `Tools`_.

We always welcome users' PRs and issues to improve OpenCompass!

.. _Get Started:
.. toctree::
   :maxdepth: 1
   :caption: Get Started

   get_started.md

.. _User Guides:
.. toctree::
   :maxdepth: 1
   :caption: User Guides

   user_guides/framework_overview.md
   user_guides/config.md
   user_guides/dataset_prepare.md
   user_guides/models.md
   user_guides/evaluation.md
   user_guides/experimentation.md
   user_guides/metrics.md

.. _Advanced Guides:
.. toctree::
   :maxdepth: 1
   :caption: Advanced Guides

   advanced_guides/new_dataset.md
   advanced_guides/new_model.md

.. _Prompts:
.. toctree::
   :maxdepth: 1
   :caption: Prompts

   prompt/overview.md
   prompt/few_shot.md
   prompt/prompt_template.md
   prompt/meta_template.md

.. _Tools:
.. toctree::
   :maxdepth: 1
   :caption: Tools

   tools.md

.. _Notes:
.. toctree::
   :maxdepth: 1
   :caption: Notes

   notes/contribution_guide.md

.. toctree::
   :caption: Switch Language

   English <https://mmpretrain.readthedocs.io/en/latest/>
   简体中文 <https://mmpretrain.readthedocs.io/zh_CN/latest/>

Indexes and Tables
==================

* :ref:`genindex`
* :ref:`search`
# Meta Prompt
## Background
When fine-tuning LLMs in practice, we often inject predefined strings according to the task requirements so that the model outputs the desired content in a natural-language format. At evaluation time, questions must also be fed in the format set at fine-tuning time for the model to perform at its best. Therefore, we need to enhance OpenICL's original prompt design to meet this need.
## Model - Meta Template
Previously, the prompt template setting was bound to the dataset config. Considering that the instructions of different models may differ, we now add a `meta_template` field to the model config, allowing users to specify instructions that are closely tied to the model.
```Python
models = [
    dict(type='LLM',
         # ...
         meta_template=dict(
             begin="meta instruction\nYou are an AI assistant.\n",
             round=[
                 dict(role='HUMAN', begin='<|HUMAN|>:', end='脷\n'),  # begin and end can be a list of strings or integers.
                 dict(role='THOUGHTS', begin='<|Inner Thoughts|>:', end='茔\n', prompt='None'),
                 dict(role='COMMANDS', begin='<|Commands|>:', end='蝮\n', prompt='None'),
                 dict(role='RESULTS', begin='<|Results|>:', end='兒\n', prompt='None'),  # Here we can set the default prompt, which may be overridden by the specific dataset
                 dict(role='BOT', begin='<|MOSS|>:', generate=True, end='氡\n'),
             ],
             end="end of conversation",
             reserved_roles=[dict(role='SYSTEM', begin='<|SYSTEM|>: ', end='\n')],
             # the token used to stop generation tasks (TODO: support strings)
             eos_token_id=65605,
         ),
    )
]
```
Here, `meta_template` is a **dict** that may contain the following fields:
- `begin` (str, optional): the beginning of the prompt, usually some meta instructions.
- `round` (list, optional): specifies the prompt format of each round of dialogue. The prompt content of each round is controlled by the prompt template in the dataset config (detailed below). If not specified, this field is replaced entirely by the prompt template in the dataset config.
- `end` (str, optional): the closing instruction.
- `reserved_roles` (list, optional): reserved roles that do not appear in the meta template itself. The roles defined here may be used in the `begin` or `end` of the dataset config, e.g. the `SYSTEM` role.
- `eos_token_id` (int, optional): the id of the eos token this model uses in generation tasks. If not set, it defaults to the eos token id of the tokenizer.
`round` specifies the speaking format of each role within a round of dialogue. It usually accepts a list whose items can be **str or dict**. Each dict accepts the following keys:
- `role` (str): the role in the dialogue, which can also be regarded as the identifier of this prompt. The string itself does not affect the actual prompt; it is only used to reference the corresponding item in the dataset config and to have its prompt content overridden there.
- `begin`, `end` (str): the opening and closing text when this role speaks.
- `prompt` (str): the prompt content, following the format conventions of `ICLPromptTemplate`. If not specified in the meta template, it must be specified in the prompt template of the dataset config.
- `generate` (bool): when set to True, this role marks where the model starts generating output in generation tasks. When rendering the prompt for a generation task, the prompt template only renders up to this role's `begin`; the rest is completed by the model.
In the example above, the final meta prompt will be:
```
meta instruction
You are an AI assistant.
<|HUMAN|>: 脷\n
<|Inner Thoughts|>: None茔\n
<|Commands|>: None蝮\n
<|Results|>: None兒\n
<|MOSS|>: 氡\n
end of conversation
```
In particular, in generation tasks the prompt is only rendered up to `<|MOSS|>:`:
```
meta instruction
You are an AI assistant.
<|HUMAN|>: 脷\n
<|Inner Thoughts|>: None茔\n
<|Commands|>: None蝮\n
<|Results|>: None兒\n
<|MOSS|>:
```
Next, we add further conventions in the dataset config.
## Dataset: Prompt Template
Once the meta template required by a model is specified in the model config, the format of the prompt template in the dataset config changes accordingly. At the same time, this design keeps the prompts backward compatible as much as possible.
Before this change, `PromptTemplate` accepted a str or a dict as input. A dict input mapped label strings to the corresponding prompts (str) and was typically used as the input of `PPLInferencer`. So essentially, the old implementation of `PromptTemplate` had only one way to represent a prompt: `str`.
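For reference, such an old-style label-to-prompt mapping looks roughly like this (a sketch for illustration only, not taken from a specific config in this commit):
```Python
# Hypothetical PPL-style template: each candidate label maps to a full prompt string.
template={
    'A': '</E></input>\nAnswer: A',
    'B': '</E></input>\nAnswer: B',
}
```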
After the change, the basic form of prompt accepted by the prompt template is extended from str to dict.
The format of this dict is similar to the meta template, and users can likewise specify the `begin`, `end` and `round` keys:
```Python
mmlu_prompt_template = dict(
    type='PromptTemplate',
    template=dict(
        begin=[
            dict(role='SYSTEM', fallback_role='HUMAN',
                 prompt='The following are '
                        'multiple choice questions (with answers) about physics.'),
            '</E>',
        ],
        round=[
            dict(role='HUMAN', prompt='</input>\nA. </A>\nB. </B>\nC. </C>\nD. </D>\nAnswer: '),
            dict(role='BOT', prompt='</target>'),
        ],
        end="end of dataset prompt template."),
    column_token_map={
        'input': '</input>',
        'A': '</A>',
        'B': '</B>',
        'C': '</C>',
        'D': '</D>',
        'target': '</target>'
    },
    ice_token='</E>',
)
```
Here, `round` specifies the prompt format of each role within a round of dialogue; it also echoes and completes the `round` configuration of the meta template, so it accepts the same parameters and follows the same rules as `round` in the meta template. **At runtime, the two prompt configurations are merged, and if a field is defined in both places, the definition in the dataset config takes precedence.**
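For instance, in the configs above the BOT item of the dataset template defines `prompt='</target>'`, while the BOT item of the meta template leaves `prompt` unset, so the merged BOT turn combines the meta template's `begin='<|MOSS|>:'` and `end` with the dataset-provided answer (rendered as `A` in the example below).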
`begin` and `end`, in addition to str input, also accept list input, in which users can mix dicts and strings to bring in system roles. Note that the example introduces a `fallback_role` setting: if the role named in `role` cannot be found in the `reserved_roles` of the meta template, it is automatically replaced by the role given in `fallback_role`. This feature exists to keep prompt templates as general as possible.
Combined with the meta template, the final generated prompt is:
```Plain
meta instruction
You are an AI assistant.
<|SYSTEM|>: The following are multiple choice questions (with answers) about college biology.
<|HUMAN|>: Which of the following is NOT a characteristic of an oligotrophic lake?
A. Low nutrient levels
B. High altitudes
C. Shallow water
D. Sand or gravel bottom
Answer: 脷\n
<|Inner Thoughts|>: None茔\n
<|Commands|>: None蝮\n
<|Results|>: None兒\n
<|MOSS|>: A氡\n
end of dataset prompt template.
end of conversation
```
In particular, since this prompt data structure (a dict) is identical to the old label-to-prompt mapping, the implementation decodes the input with the new rules only when the dict keys are a subset of {`begin`, `round`, `end`}; otherwise the dict is still decoded as a label-to-prompt mapping. Moreover, this scheme also allows new-style prompt dicts to be nested inside an old-style label-to-prompt dict. For example, the following expression is also legal (taken from `configs/datasets/mmlu.py`):
```Python
prompt_template={
    target: dict(
        begin=[
            dict(role='SYSTEM', fallback_role='HUMAN',
                 prompt='The following are '
                        'multiple choice questions (with answers) about '
                        f'{name.replace("_", " ")}.\n'),
            '</E>',
        ],
        round=[
            dict(role='HUMAN', prompt='</input>\nA. </A>\nB. </B>\nC. </C>\nD. </D>\nAnswer: '),
            dict(role='BOT', prompt=f'{target}'),
        ])
    for target in ['A', 'B', 'C', 'D']  # use the actual answer
}
```
### Without a meta template
To keep backward compatibility, when no meta template is specified in the model config, `ICLPromptTemplate` concatenates each dict into a plain string in the order `begin`, `prompt`, `end`.
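For example, a hypothetical round item `dict(role='HUMAN', begin='<|HUMAN|>: ', prompt='{question}', end='\n')` would simply be flattened into the string `'<|HUMAN|>: {question}\n'`.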
### Multi-round dialogue example
Sometimes a complete interaction needs to contain several rounds of dialogue. Users can refer to `configs/datasets/gsm8k.py` to configure their own templates; a rough sketch follows.
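The sketch below is illustrative only (the few-shot prompts are made up, and the actual `configs/datasets/gsm8k.py` may differ): a multi-round template simply lists several HUMAN/BOT pairs in `round`.
```Python
gsm8k_infer_cfg = dict(
    prompt_template=dict(
        type='PromptTemplate',
        template=dict(round=[
            # One in-context example: a HUMAN question followed by the BOT answer.
            dict(role='HUMAN', prompt='Question: A pen costs 2 dollars. How much do 3 pens cost?'),
            dict(role='BOT', prompt='Answer: 6'),
            # The current sample; the final BOT turn is what the model should produce.
            dict(role='HUMAN', prompt='Question: {question}'),
            dict(role='BOT', prompt='Answer: {answer}'),
        ])))
```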
# Useful Tools
## Prompt Viewer
This tool lets you inspect the prompts that will actually be fed to the model, without launching the full evaluation pipeline.
How to run:
```bash
python tools/prompt_viewer.py [CONFIG_PATH]
```
## Case Analyzer
Based on existing evaluation results, this tool produces the samples with inference errors as well as the full set of samples with annotation information.
How to run:
```bash
python tools/case_analyzer.py [CONFIG_PATH] [-w WORK_DIR]
```
- `-w`: work directory, default `'./outputs/default'`.
More details can be found in the [Feishu doc](https://aicarrier.feishu.cn/docx/SgrLdwinion00Kxkzh2czz29nIh).
## Lark Bot
Users can monitor task status in real time by configuring a Lark (Feishu) bot. Please refer to [this document](https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN?lang=zh-CN#7a28964d) to set up the bot.
Configuration:
- Open the `configs/secrets.py` file and add the following line:
```python
lark_bot_url = 'YOUR_WEBHOOK_URL'
```
Typically, the webhook URL looks like https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxxxx .
- Inherit this file in the complete evaluation configuration:
```python
_base_ = [
'secrets.py',
...
]
```
See `configs/eval.py` for an example.
- To avoid frequent messages from the bot becoming a nuisance, task status is not reported automatically by default. When needed, you can turn on status reporting with `-l` or `--lark`:
```bash
python run.py configs/eval_demo.py -p {PARTITION} -l
```
## API Model Tests
This tool can quickly check whether an API model wrapper works properly.
How to run:
```bash
python tools/test_api_model.py [CONFIG_PATH]
```
# Dataset Preparation and Selection
This tutorial focuses on how to prepare the datasets already supported by OpenCompass and how to build the configuration files needed to select them.
## Directory Structure of Dataset Configuration Files
First, here is a brief overview of the structure under the OpenCompass `configs/datasets` directory:
```
configs/datasets/
├── ChineseUniversal # ability dimension
│   ├── CLUE_afqmc # datasets under this dimension
│   │   ├── CLUE_afqmc_gen_db509b.py # different config files of this dataset
│   │   ├── CLUE_afqmc_gen.py
│   │   ├── CLUE_afqmc_ppl_00b348.py
│   │   ├── CLUE_afqmc_ppl_2313cf.py
│   │   └── CLUE_afqmc_ppl.py
│   ├── CLUE_C3
│   │   ├── ...
│   ├── ...
├── Coding
├── collections
├── Completion
├── EnglishUniversal
├── Exam
├── glm
├── LongText
├── MISC
├── NLG
├── QA
├── Reasoning
├── Security
└── Translation
```
Under `configs/datasets`, datasets are organized into more than ten ability dimensions, such as Chinese/English universal, exams, QA, reasoning, security, and so on. Each dimension contains a series of datasets, and the folder of each dataset holds several dataset configurations.
Dataset config file names follow the pattern `{dataset name}_{evaluation method}_{prompt version}.py`. Taking `ChineseUniversal/CLUE_afqmc/CLUE_afqmc_gen_db509b.py` as an example, it configures the `CLUE_afqmc` dataset under the Chinese-universal ability dimension, with evaluation method `gen` (generative evaluation) and prompt version `db509b`; likewise, `CLUE_afqmc_ppl_00b348.py` uses the `ppl` (discriminative) evaluation method with prompt version `00b348`.
Files without a version number, such as `CLUE_afqmc_gen.py`, point to the latest prompt configuration of that evaluation method, which is usually the most accurate prompt.
## Dataset Preparation
The datasets supported by OpenCompass consist of two parts:
1. Huggingface datasets
[Huggingface Dataset](https://huggingface.co/datasets) provides a large number of datasets. OpenCompass already supports most of the datasets commonly used for performance comparison; for the exact list, look under `configs/datasets`.
2. OpenCompass self-built datasets
Besides the datasets already available on Huggingface, OpenCompass also provides some self-built Chinese datasets; a dedicated dataset repo will be provided for download in the future. Following the documentation, place the datasets under the `./data` directory to complete dataset preparation.
Note that the repo contains not only the self-built datasets but also, for convenience, some datasets already supported by HF, to make testing easier.
## Dataset Selection
In each dataset config file, the dataset is defined in a `{}_datasets` variable, e.g. `afqmc_datasets` in `ChineseUniversal/CLUE_afqmc/CLUE_afqmc_gen_db509b.py`:
```python
afqmc_datasets = [
    dict(
        abbr="afqmc-dev",
        type=AFQMCDataset_V2,
        path="./data/CLUE/AFQMC/dev.json",
        reader_cfg=afqmc_reader_cfg,
        infer_cfg=afqmc_infer_cfg,
        eval_cfg=afqmc_eval_cfg,
    ),
]
```
and `cmnli_datasets` in `ChineseUniversal/CLUE_cmnli/CLUE_cmnli_ppl_b78ad4.py`:
```python
cmnli_datasets = [
    dict(
        type=HFDataset,
        abbr='cmnli',
        path='json',
        split='train',
        data_files='./data/CLUE/cmnli/cmnli_public/dev.json',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg)
]
```
Taking the two datasets above as an example, if you want to evaluate both of them at the same time, you can create a new config file under the `configs` directory and use the direct-import mechanism of `mmengine` configs to build the dataset part, as shown below:
```python
from mmengine.config import read_base
with read_base():
    from .datasets.CLUE_afqmc.CLUE_afqmc_gen_db509b import afqmc_datasets
    from .datasets.CLUE_cmnli.CLUE_cmnli_ppl_b78ad4 import cmnli_datasets
datasets = []
datasets += afqmc_datasets
datasets += cmnli_datasets
```
You can pick config files of different abilities, datasets, and evaluation methods as needed to build the dataset part of your evaluation script.
For how to launch an evaluation task and how to evaluate self-built datasets, please refer to the relevant documents.
import json
import re
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class GaokaoBenchDataset(BaseDataset):

    @staticmethod
    def load(path: str):
        with open(path) as f:
            data = json.load(f)
        return Dataset.from_list(data['example'])


valid_gaokao_bench_question_types = [
    'single_choice', 'multi_choice', 'multi_question_choice',
    'five_out_of_seven', 'cloze', 'subjective', 'correction'
]
class GaokaoBenchEvaluator(BaseEvaluator):

    def __init__(self, question_type) -> None:
        super().__init__()
        assert question_type in valid_gaokao_bench_question_types
        self.question_type = question_type

    def do_predictions_postprocess(self, model_output, answer_length=None):
        if self.question_type == 'single_choice':
            model_answer = []
            # Scan the reversed output so that the last option letter wins.
            temp = re.findall(r'[A-D]', model_output[::-1])
            if len(temp) != 0:
                model_answer.append(temp[0])
        elif self.question_type == 'multi_question_choice':
            model_answer = []
            temp = re.findall(r'【答案】\s*[::]*\s*[A-Z]', model_output)
            if len(temp) == answer_length:
                for t in temp:
                    model_answer.append(re.findall(r'[A-Z]', t)[0])
            else:
                # Fall back to collecting bare option letters.
                temp = re.findall(r'[A-Z]', model_output)
                if len(temp) > 0:
                    for k in range(min(len(temp), answer_length)):
                        model_answer.append(temp[k])
        elif self.question_type == 'multi_choice':
            model_answer = []
            answer = ''
            content = re.sub(r'\s+', '', model_output)
            answer_index = content.find('【答案】')
            if answer_index > 0:
                temp = content[answer_index:]
                if len(re.findall(r'[A-D]', temp)) > 0:
                    for t in re.findall(r'[A-D]', temp):
                        answer += t
            else:
                # No answer marker: look at the last few characters only.
                temp = content[-10:]
                if len(re.findall(r'[A-D]', temp)) > 0:
                    for t in re.findall(r'[A-D]', temp):
                        answer += t
            if len(answer) != 0:
                model_answer.append(answer)
        elif self.question_type == 'five_out_of_seven':
            model_answer = []
            temp = re.findall(r'[A-G]', model_output)
            if len(temp) > 0:
                for k in range(min(5, len(temp))):
                    model_answer.append(temp[k])
        return model_answer

    def ensure_same_length(self, pred, refr):
        if len(pred) == len(refr):
            return pred
        return ['Z'] * len(refr)

    def score(self, predictions, references):
        if self.question_type not in [
                'single_choice', 'multi_choice', 'multi_question_choice',
                'five_out_of_seven'
        ]:
            return {'score': 0}
        elif self.question_type == 'multi_choice':
            # 2 points for an exact match, 1 point if no wrong option is chosen.
            correct_score, total_score = 0, 0
            for pred, refr in zip(predictions, references):
                pred = self.do_predictions_postprocess(pred)
                pred = self.ensure_same_length(pred, refr)
                for p, r in zip(pred, refr):
                    if p == r:
                        correct_score += 2
                    else:
                        for i in p:
                            if i not in r:
                                break
                        else:
                            correct_score += 1
                    total_score += 2
            return {'score': correct_score / total_score * 100}
        else:
            correct_score, total_score = 0, 0
            for pred, refr in zip(predictions, references):
                if self.question_type == 'multi_question_choice':
                    pred = self.do_predictions_postprocess(pred, len(refr))
                else:
                    pred = self.do_predictions_postprocess(pred)
                pred = self.ensure_same_length(pred, refr)
                for p, r in zip(pred, refr):
                    if p == r:
                        correct_score += 1
                    total_score += 1
            return {'score': correct_score / total_score * 100}
for question_type in valid_gaokao_bench_question_types:
    # Register one evaluator per question type. A helper function is used so
    # that `question_type` is captured by value (the classic late-binding
    # closure problem with lambdas defined in a loop).
    def _gaokao_register(question_type):
        ICL_EVALUATORS.register_module(
            name='GaokaoBenchEvaluator' + '_' + question_type,
            module=lambda *args, **kwargs: GaokaoBenchEvaluator(
                question_type=question_type, *args, **kwargs))

    _gaokao_register(question_type)
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class AFQMCDataset_V2(BaseDataset):

    @staticmethod
    def load(path):
        data = []
        with open(path, 'r') as f:
            for line in f:
                line = json.loads(line)
                # Map the numeric label (0/1) to the option letter ('A'/'B') used in prompts.
                line['label'] = 'AB'[int(line['label'])]
                data.append(line)
        return Dataset.from_list(data)