"gallery/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "85b7858075d214f27c8e981cd9fea28a8f95249c"
Unverified commit 637628a7, authored by Songyang Zhang, committed by GitHub

[Doc] Update Doc for Alignbench (#707)

* update alignmentbench

* update alignmentbench

* update doc

* update

* update
```python
from opencompass.models import OpenAI

models = [
    dict(abbr='GPT-3.5-turbo',
         type=OpenAI, path='gpt-3.5-turbo', key='sk-xxx',
         max_out_len=2048, max_seq_len=2048, batch_size=1)
]
```
```python
from opencompass.models import OpenAI

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models = [
    dict(abbr='GPT-3.5-turbo-0613',
         type=OpenAI, path='gpt-3.5-turbo-0613',
         # The key will be obtained from $OPENAI_API_KEY, but you can write your key here as well
         key='ENV',
         meta_template=api_meta_template,
         query_per_second=1,
         max_out_len=2048, max_seq_len=4096, batch_size=8),
]
```
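As the comment notes, `key='ENV'` makes the config read the token from the `OPENAI_API_KEY` environment variable, so the key can stay out of the config file. For example (placeholder value):

```bash
export OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxx
```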
@@ -6,13 +6,25 @@

Subjective evaluation aims to assess the model's performance in tasks that align with human preferences.
To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
Popular evaluation methods include:

- Compare Mode: comparing model responses pairwise to calculate their win rate.
- Score Mode: scoring a single model's response on its own (e.g., [Chatbot Arena](https://chat.lmsys.org/)).
We support the use of GPT-4 (or other JudgeLLMs) for the subjective evaluation of models based on the above methods.
## Subjective Evaluation with Custom Dataset
The specific process includes:

1. Data preparation
2. Model response generation
3. Response evaluation with a JudgeLLM
4. Parsing the JudgeLLM's output and calculating the metrics
### Step-1: Data Preparation
We provide a mini test set for **Compare Mode** and **Score Mode**, as shown below:
```python
###COREV2
...
```

@@ -44,62 +56,150 @@

The JSON must include the following fields: 'question', 'capability', and 'others'.
If you want to customize the prompt for an individual question, you can put additional information into the 'others' field and use it when constructing the prompt.
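For illustration only, a single item could look roughly like the sketch below; the values are invented (the question paraphrases the demo item shown later on this page) and the keys inside 'others' are assumptions, so take the exact schema from the demo sets themselves:

```python
# Hypothetical item, for illustration only
[
    {
        "question": "If I toss a ball straight up into the air, which direction does it travel first?",
        "capability": "commonsense reasoning",  # assumed capability label
        "others": {
            "reference": "Upward, until gravity slows it down and it falls back.",  # assumed key
            "question_id": 1                                                         # assumed key
        }
    }
]
```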
### Step-2: Evaluation Configuration (Compare Mode)
For `config/eval_subjective_compare.py`, we provide some annotations to help users understand the configuration file.
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer

with read_base():
    # Pre-defined models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.openai.gpt_4 import models as gpt4_model
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Evaluation datasets
datasets = [*subjective_datasets]

# Models to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Inference configuration
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',  # m models versus n models
        # In m2n mode you must specify base_models and compare_models;
        # pairs are generated between the two lists.
        base_models=[*hf_qwen_14b_chat],  # baseline model(s)
        compare_models=[*hf_qwen_7b_chat, *hf_chatglm3_6b],  # models to be evaluated
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=gpt4_model,  # judge model
        )),
)

# Work directory; with '--reuse', existing results under it are reused automatically
work_dir = './outputs/subjective/'

summarizer = dict(
    type=Corev2Summarizer,  # summarizer for this custom dataset
    match_method='smart',   # answer extraction method
)
```
In addition, you can change the order in which the two models' responses are presented; please refer to `config/eval_subjective_compare.py`.
When `infer_order` is set to `random`, the two responses are presented in random order; when it is set to `double`, the two responses are judged in both orders.
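As a rough sketch of where that switch lives (this assumes `infer_order` is a key of each subjective dataset dict, as in the shipped `subjective_corev2` config; verify against your own dataset config):

```python
from mmengine.config import read_base

with read_base():
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Override the response order for every subjective dataset:
# 'random' shuffles which model's answer is shown first,
# 'double' judges the pair in both orders.
for d in subjective_datasets:
    d['infer_order'] = 'double'

datasets = [*subjective_datasets]
```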
### Step-2: Evaluation Configuration (Score Mode)

For `config/eval_subjective_score.py`, the configuration is mostly the same as `config/eval_subjective_compare.py`; you only need to change the evaluation mode to `singlescore`.
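Concretely, the key difference is the partitioner mode; a minimal sketch (runner and judge settings as in the Compare Mode config above):

```python
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',  # score each response on its own instead of pairwise
    ),
    # runner and judge_cfg are the same as in the Compare Mode example
)
```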
### Step-3: Launch the Evaluation

```shell
python run.py config/eval_subjective_score.py -r
```

The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.
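Depending on your OpenCompass version, `-r` can also take the timestamp of a previous run so that its specific output directory is reused (the timestamp below is a placeholder):

```shell
python run.py config/eval_subjective_score.py -r 20231214_173632
```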
The responses of the JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
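If you want to post-process the report programmatically, a minimal sketch is shown below, assuming the report is an ordinary CSV file; the path is a placeholder and the column layout depends on the dataset and summarizer:

```python
import csv

# Placeholder path: substitute your own work_dir and timestamp
report_path = 'outputs/subjective/summary/20231214_173632/report.csv'

with open(report_path, newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)
```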
## Practice: AlignBench Evaluation
### Dataset
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration
Please edit the config `configs/eval_subjective_alignbench.py` according to your needs.
### Evaluation
```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
### Submit to the Official Leaderboard (Optional)

If you want to submit your predictions to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.

- Make sure you have the following results:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions # model's response
├── results
└── summary
```
- Convert the data:
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- Get the `.csv` files under `submission/` for submission:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
    ├── submission # files for submission
└── summary
```
@@ -6,16 +6,28 @@

To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
Popular evaluation methods include:

- Compare Mode: comparing model responses pairwise to compute the head-to-head win rate.
- Score Mode: scoring a single model's response on its own (e.g., [Chatbot Arena](https://chat.lmsys.org/)).
Based on the above methods, we support using a JudgeLLM to evaluate models' subjective capabilities (any model currently supported in the opencompass repository can be used directly as a JudgeLLM, and support for some dedicated JudgeLLMs is also planned).
## Subjective Evaluation with a Custom Dataset
The subjective evaluation pipeline includes:

1. Preparing the evaluation dataset
2. Running inference for the questions with API or open-source models
3. Evaluating the model outputs with the chosen judge model (JudgeLLM)
4. Parsing the JudgeLLM's predictions and computing the numeric metrics
### Step 1: Data Preparation

We provide a demo test set for Compare Mode and for Score Mode, as shown below:
```python
### Compare Mode example
[
    {
        "question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
        ...
    },
...]

### Score Mode example
[
    {
        "question": "请你扮演一个邮件管家,我让你给谁发送什么主题的邮件,你就帮我扩充好邮件正文,并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题,来斟酌用词,并使用合适的敬语。现在请给导师发送邮件,询问他是否可以下周三下午15:00进行科研同步会,大约200字。",
        ...
    },
...]
```

@@ -44,62 +56,150 @@
The three required fields are 'question', 'capability', and 'others'; users may also add other fields. If the prompt for a particular question needs special handling, you can put extra settings into the 'others' field and add the corresponding field in the Dataset class.
### Step 2: Evaluation Configuration (Compare Mode)

For pairwise comparison, please refer to `config/eval_subjective_compare.py` for the detailed settings; below is an abridged, annotated version to help users understand what the configuration file means.
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer

with read_base():
    # Import pre-defined models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.openai.gpt_4 import models as gpt4_model
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Evaluation datasets
datasets = [*subjective_datasets]

# Models to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Inference configuration
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',  # m models compete against n models
        # In m2n mode you must specify base_models and compare_models; pairs are
        # generated between them (deduplicated, and no model is compared with itself).
        base_models=[*hf_qwen_14b_chat],  # baseline model used for comparison
        compare_models=[*hf_qwen_7b_chat, *hf_chatglm3_6b],  # models to be evaluated
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=gpt4_model,  # judge model
        )),
)

# Work directory; when launched with '--reuse', all existing results under it are reused automatically
work_dir = './outputs/subjective/'

summarizer = dict(
    type=Corev2Summarizer,  # summarizer for this custom dataset
    match_method='smart',   # answer extraction method
)
```
In addition, in the dataset config you can choose the order in which the two responses are presented during pairwise comparison; please refer to `config/eval_subjective_compare.py`.
When `infer_order` is set to `random`, the order of the two models' responses is shuffled randomly; when it is set to `double`, the two responses are judged in both orders.
### Step 2: Evaluation Configuration (Score Mode)

For single-response scoring, please refer to `config/eval_subjective_score.py` for the detailed settings. Most of it is the same as the pairwise-comparison config; you only need to change the evaluation mode to `singlescore`.
### Step 3: Launch the Evaluation and Output the Results

```shell
python run.py configs/eval_subjective_score.py -r
```

- The `-r` flag reuses existing model inference and evaluation results.
The JudgeLLM's responses are saved to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report is written to `output/.../summary/timestamp/report.csv`.
## Practice: AlignBench Subjective Evaluation

### Dataset Preparation
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration

Modify the config file `configs/eval_subjective_alignbench.py` as needed.

### Launch the Evaluation

Running the command below starts answer inference followed by subjective scoring. If you only need inference, you can specify `-m infer` (see the example after the command).
```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
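If you only want the inference stage mentioned above, the same command with `-m infer` skips the subjective scoring:

```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py -m infer
```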
### Submit to the Official Leaderboard (Optional)

After the evaluation, if you want to submit to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.

- Make sure inference has finished and you have the following files:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
    ├── predictions # model responses
├── results
└── summary
```
- Run the following command to produce results ready for submission:
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- The `.csv` files ready for submission are in the `submission` folder:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
    ├── submission # files for submission
└── summary
```
@@ -384,6 +384,7 @@ class OpenAIAllesAPIN(OpenAI):
            elif item['role'] == 'SYSTEM':
                msg['role'] = 'system'
            messages.append(msg)
            # Messages may contain both 'user' and 'system' roles
            # when an agent is involved.
            assert msg['role'] in ['user', 'system']
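For illustration (a hypothetical message list, not code from the repository): after this mapping, an agent-style history may legitimately contain both roles, which is exactly what the assertion permits:

```python
# Hypothetical mapped history for an agent-style conversation
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant with tool access.'},
    {'role': 'user', 'content': 'What is 2 + 2?'},
    {'role': 'system', 'content': 'Tool output: 4'},  # agent/tool feedback mapped to 'system'
]
assert all(m['role'] in ['user', 'system'] for m in messages)
```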
@@ -2,29 +2,38 @@ import argparse
import csv
import json
import os
from glob import glob

from tqdm import tqdm


def extract_predictions_from_json(input_folder):
    # Collect each model's AlignBench predictions and write one CSV per model
    # into the 'submission/' folder of the experiment directory.
    sub_folder = os.path.join(input_folder, 'submission')
    pred_folder = os.path.join(input_folder, 'predictions')
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    for model_name in os.listdir(pred_folder):
        model_folder = os.path.join(pred_folder, model_name)
        json_paths = glob(os.path.join(model_folder, 'alignment_bench_*.json'))
        # sort the prediction shards by their numeric index
        json_paths = sorted(
            json_paths, key=lambda x: int(x.split('.json')[0].split('_')[-1]))
        all_predictions = []
        for json_ in json_paths:
            json_data = json.load(open(json_))
            for _, value in json_data.items():
                prediction = value['prediction']
                all_predictions.append(prediction)
        # write one prediction per row
        output_path = os.path.join(sub_folder, model_name + '_submission.csv')
        with open(output_path, 'w', encoding='utf-8') as file:
            writer = csv.writer(file)
            for ans in tqdm(all_predictions):
                writer.writerow([str(ans)])
        print('Saved {} for submission'.format(output_path))


def process_jsonl(file_path):
@@ -61,9 +70,7 @@ def parse_args():
    parser.add_argument('--json',
                        default='your prediction file path',
                        help='The results json path')
    parser.add_argument('--exp-folder',
                        help='The experiment output folder, e.g. outputs/20231214_173632')
    args = parser.parse_args()
    return args
@@ -75,4 +82,4 @@ if __name__ == '__main__':
        processed_data = process_jsonl(args.jsonl)
        save_as_json(processed_data)
    elif mode == 'csv':
        extract_predictions_from_json(args.exp_folder)