"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "2f43cff2b72e9a5bee26d31e4e8af3087a5618e1"
Unverified Commit 637628a7 authored by Songyang Zhang's avatar Songyang Zhang Committed by GitHub
Browse files

[Doc] Update Doc for Alignbench (#707)

* update alignmentbench

* update alignmentbench

* update doc

* update

* update
from opencompass.models import OpenAI

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models = [
    dict(abbr='GPT-3.5-turbo-0613',
         type=OpenAI, path='gpt-3.5-turbo-0613',
         key='ENV',  # The key will be read from $OPENAI_API_KEY, but you can also write your key here directly
         meta_template=api_meta_template,
         query_per_second=1,
         max_out_len=2048, max_seq_len=4096, batch_size=8),
]
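Since the model config above uses `key='ENV'`, the OpenAI key is read from the environment. As a minimal sanity-check sketch (not part of the original config), you can verify the variable is set before launching a run:

```python
import os

# key='ENV' in the config above means the OpenAI key is taken from the
# environment, so make sure it is exported before starting the evaluation.
assert os.environ.get('OPENAI_API_KEY'), 'Please export OPENAI_API_KEY first.'
```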
Subjective evaluation aims to assess the model's performance in tasks that align with human preferences.
To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
Popular evaluation methods include:

- Compare Mode: compare model responses pairwise and compute the win rate.
- Score Mode: assign a score to each single model response (e.g., [Chatbot Arena](https://chat.lmsys.org/)).

We support using GPT-4 (or another JudgeLLM) for the subjective evaluation of models based on the above methods.
## Subjective Evaluation with Custom Dataset
The specific process includes:

1. Data preparation
2. Model response generation
3. Evaluation of the responses with a JudgeLLM
4. Parsing the JudgeLLM's output and computing the metrics
### Step-1: Data Preparation
We provide a mini test set for **Compare Mode** and **Score Mode**, as shown below:
```python
###COREV2
...
```

The JSON must include the required fields shown in the demo set above; you can also add extra fields. If you want to customize the prompt for each individual question, you can put the additional information into the 'others' field and construct the prompt from it.
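As a hedged illustration, a single entry of such a custom dataset might look like the dict below; only `question` and `others` come from the text above, while `capability` and the keys inside `others` are assumptions for demonstration.

```python
# Hypothetical single entry of a custom subjective dataset.
example_entry = {
    'question': 'If I throw a ball straight up, which direction does it travel first?',
    'capability': 'reasoning',  # assumed field name
    'others': {
        # assumed keys used to build a per-question prompt
        'evaluating_guidance': 'Judge whether the answer states the ball first moves upward.',
        'reference_answer': 'Upward.',
    },
}
```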
### Step-2: Evaluation Configuration (Compare Mode)
The specific process includes:

1. Model response inference
2. JudgeLLM pairwise evaluation
3. Evaluation report generation
For `configs/eval_subjective_compare.py`, we provide some annotations to help users understand the configuration file.
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer

with read_base():
    # Pre-defined models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.openai.gpt_4 import models as gpt4_model
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Evaluation datasets
datasets = [*subjective_datasets]

# Models to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Inference configuration
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',  # m models vs. n models
        # In m2n mode you must specify base_models and compare_models;
        # pairs are generated between base_models and compare_models.
        base_models=[*hf_qwen_14b_chat],  # baseline model(s)
        compare_models=[*hf_qwen_7b_chat, *hf_chatglm3_6b],  # models to be evaluated (imported above)
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=gpt4_model  # judge model
        )),
)

# Work directory; with '--reuse', existing results in it are reused automatically
work_dir = './outputs/subjective/'

summarizer = dict(
    type=Corev2Summarizer,  # custom dataset summarizer
    match_method='smart',  # answer extraction method
)
```
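To make the `m2n` comments above concrete, here is a minimal sketch of the pairing behaviour (an assumption for illustration, not the actual `SubjectiveNaivePartitioner` implementation): every base model is matched with every compare model, skipping self-comparisons and duplicate pairs.

```python
def m2n_pairs(base_models, compare_models):
    """Sketch of m2n pairing: base x compare, no self-pairs, no duplicates."""
    pairs = []
    for base in base_models:
        for cand in compare_models:
            if base == cand or (base, cand) in pairs:
                continue
            pairs.append((base, cand))
    return pairs


print(m2n_pairs(['qwen-14b-chat'], ['qwen-7b-chat', 'chatglm3-6b']))
# [('qwen-14b-chat', 'qwen-7b-chat'), ('qwen-14b-chat', 'chatglm3-6b')]
```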
In addition, you can also change the order in which the two models' responses are presented; please refer to `configs/eval_subjective_compare.py`.
When `infer_order` is set to `random`, the responses are presented in random order;
when `infer_order` is set to `double`, the responses of the two models are judged in both orders.
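A small sketch of what the two `infer_order` settings mean for one question answered by models A and B (an illustration of the idea, not the OpenCompass internals):

```python
import random


def build_comparisons(answer_a, answer_b, infer_order):
    if infer_order == 'random':
        pair = [answer_a, answer_b]
        random.shuffle(pair)  # one judged pair, presentation order shuffled
        return [tuple(pair)]
    if infer_order == 'double':
        # two judged pairs, one for each presentation order
        return [(answer_a, answer_b), (answer_b, answer_a)]
    raise ValueError(f'unknown infer_order: {infer_order}')
```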
### Step-2: Evaluation Configuration (Score Mode)
The config `configs/eval_subjective_score.py` is largely the same as `configs/eval_subjective_compare.py`; you only need to change the eval mode to `singlescore`.
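As a sketch of that change (the runner and judge settings are assumed to stay exactly as in the Compare Mode config above):

```python
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner

# Sketch: switch the partitioner mode from 'm2n' to 'singlescore' so that each
# response is scored on its own; the rest of the eval config is assumed unchanged.
eval_partitioner = dict(
    type=SubjectiveNaivePartitioner,
    mode='singlescore',
)
```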
### Step-3: Launch the Evaluation
```shell
python run.py configs/eval_subjective_score.py -r
```
The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.
## Evaluation Report
The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
## Practice: AlignBench Evaluation
### Dataset
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration
Please edit the config `configs/eval_subjective_alignbench.py` according to your needs.
### Evaluation

Running the following command starts both answer inference and subjective scoring; if you only need inference, you can specify `-m infer`.

```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
### Submit to the Official Leaderboard (Optional)

If you want to submit your predictions to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.

- Make sure you have the following results:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
    ├── predictions # model responses
├── results
└── summary
```
- Convert the data
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- Get the `.csv` files in `submission/` for submission
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
    ├── submission # files for submission
└── summary
```
To explore the model's subjective capabilities, we employ a JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).

Popular evaluation methods include:

- Compare Mode: compare model responses pairwise to compute their win rate.
- Score Mode: assign a score to each single model response (e.g., [Chatbot Arena](https://chat.lmsys.org/)).

Based on the above methods, we support using a JudgeLLM for subjective evaluation (any model currently supported in the opencompass repository can be called directly as a JudgeLLM, and support for some dedicated JudgeLLMs is also planned).
## Subjective Evaluation with Custom Dataset
The specific process of subjective evaluation includes:

1. Evaluation dataset preparation
2. Response inference with API models or open-source models
3. Evaluation of the model outputs with the chosen JudgeLLM
4. Parsing the JudgeLLM's predictions and computing numerical metrics
### Step-1: Data Preparation

We provide a demo test set for each of Compare Mode and Score Mode, as shown below:
```python
### Compare Mode example
[
    {
        "question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
        ...
    },
...]

### Score Mode dataset example
[
    {
        "question": "请你扮演一个邮件管家,我让你给谁发送什么主题的邮件,你就帮我扩充好邮件正文,并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题,来斟酌用词,并使用合适的敬语。现在请给导师发送邮件,询问他是否可以下周三下午15:00进行科研同步会,大约200字。",
        ...
    },
...]
```
The three fields above are required; users may also add other fields. If the prompt for each individual question needs separate handling, you can put extra settings in the 'others' field and add the corresponding fields in the Dataset class.
### Step-2: Evaluation Configuration (Compare Mode)

The specific process includes:

1. Model response inference
2. JudgeLLM evaluation
3. Evaluation report generation
For pairwise comparison, please refer to `configs/eval_subjective_compare.py` for more detailed config settings; below we provide abridged annotations to help users understand the meaning of the configuration file.
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer

with read_base():
    # Pre-defined models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.openai.gpt_4 import models as gpt4_model
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Evaluation datasets
datasets = [*subjective_datasets]

# Models to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Inference configuration
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',  # m models vs. n models
        # In m2n mode you must specify base_models and compare_models;
        # pairs are generated between base_models and compare_models
        # (deduplicated, and no model is compared against itself).
        base_models=[*hf_qwen_14b_chat],  # baseline model(s)
        compare_models=[*hf_qwen_7b_chat, *hf_chatglm3_6b],  # models to be evaluated (imported above)
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=gpt4_model  # judge model
        )),
)

# Work directory; with '--reuse', existing results in it are reused automatically
work_dir = './outputs/subjective/'

summarizer = dict(
    type=Corev2Summarizer,  # custom dataset summarizer
    match_method='smart',  # answer extraction method
)
```
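The introduction notes that any model supported in the repository can serve as the JudgeLLM. As a hedged sketch (not taken from the original config), the judge passed to `SubjectiveEvalTask` could be one of the open-source configs imported via `read_base()` above instead of `gpt4_model`:

```python
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# Hypothetical variant: reuse an already imported open-source chat config
# (e.g. hf_qwen_14b_chat) as the judge, passed exactly where gpt4_model
# is passed in the eval config above.
task = dict(
    type=SubjectiveEvalTask,
    judge_cfg=hf_qwen_14b_chat,  # assumption: any supported model config can act as judge
)
```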
In addition, in the dataset config you can also choose the order in which the two responses are presented; please refer to `configs/eval_subjective_compare.py`.
When `infer_order` is set to `random`, the order of the two models' responses is shuffled randomly;
when `infer_order` is set to `double`, the two models' responses are judged in both orders.
### Step-2: Evaluation Configuration (Score Mode)
For single-response scoring, please refer to `configs/eval_subjective_score.py` for more detailed config settings. Most of this config is the same as the pairwise-comparison config; you only need to change the eval mode to `singlescore`.
### Step-3: Launch the Evaluation and Get the Results
```shell
python run.py configs/eval_subjective_score.py -r
```
The `-r` parameter allows reusing model inference and evaluation results.

## Evaluation Report
The JudgeLLM's responses will be saved to `output/.../results/timestamp/xxmodel/xxdataset/.json`, and the evaluation report will be output to `output/.../summary/timestamp/report.csv`.
## Practice: AlignBench Subjective Evaluation

### Dataset Preparation
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration

Please modify the config file `configs/eval_subjective_alignbench.py` as needed.

### Launch the Evaluation

Running the following command starts both answer inference and subjective scoring; if you only need inference, you can specify `-m infer`.
```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
### Submit to the Official Leaderboard (Optional)

After the evaluation, if you want to submit your results to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.

- Make sure inference has finished and you have the following files:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
    ├── predictions # model responses
├── results
└── summary
```
- Run the following command to get the results ready for submission:
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- The `.csv` files for submission can be found in the `submission/` folder:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
    ├── submission # files for submission
└── summary
```
@@ -384,6 +384,7 @@ class OpenAIAllesAPIN(OpenAI):
elif item['role'] == 'SYSTEM':
    msg['role'] = 'system'
messages.append(msg)
# The model may respond with both 'user' and 'system' roles
# when an agent is involved.
assert msg['role'] in ['user', 'system']
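For context, the role conversion this fragment belongs to can be summarized by the mapping below; the `HUMAN`/`BOT` entries follow the `api_meta_template` examples earlier and this is an illustration rather than the full implementation:

```python
# Meta-template roles on the left, OpenAI chat roles on the right.
role_map = {'HUMAN': 'user', 'BOT': 'assistant', 'SYSTEM': 'system'}
```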
@@ -2,29 +2,38 @@ import argparse
import csv
import json
import os
from glob import glob
from tqdm import tqdm
def extract_predictions_from_json(input_folder):
    sub_folder = os.path.join(input_folder, 'submission')
    pred_folder = os.path.join(input_folder, 'predictions')
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    for model_name in os.listdir(pred_folder):
        model_folder = os.path.join(pred_folder, model_name)
        json_paths = glob(os.path.join(model_folder, 'alignment_bench_*.json'))
        # sort prediction shards by index
        json_paths = sorted(
            json_paths, key=lambda x: int(x.split('.json')[0].split('_')[-1]))
        all_predictions = []
        for json_ in json_paths:
            json_data = json.load(open(json_))
            for _, value in json_data.items():
                prediction = value['prediction']
                all_predictions.append(prediction)

        # write one submission CSV per model
        output_path = os.path.join(sub_folder, model_name + '_submission.csv')
        with open(output_path, 'w', encoding='utf-8') as file:
            writer = csv.writer(file)
            for ans in tqdm(all_predictions):
                writer.writerow([str(ans)])
        print('Saved {} for submission'.format(output_path))
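For reference, a hedged sketch of the prediction shard layout that `extract_predictions_from_json` expects under `outputs/<timestamp>/predictions/<model_name>/`; only the `prediction` field is actually read, and the other keys are illustrative assumptions:

```python
# alignment_bench_0.json (hypothetical content)
example_shard = {
    "0": {"origin_prompt": "...", "prediction": "model answer for sample 0"},
    "1": {"origin_prompt": "...", "prediction": "model answer for sample 1"},
}
```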
def process_jsonl(file_path):
@@ -61,9 +70,7 @@ def parse_args():
    parser.add_argument('--json',
                        default='your prediction file path',
                        help='The results json path')
    parser.add_argument('--exp-folder',
                        help='The experiment output folder')
    args = parser.parse_args()
    return args
@@ -75,4 +82,4 @@ if __name__ == '__main__':
        processed_data = process_jsonl(args.jsonl)
        save_as_json(processed_data)
    elif mode == 'csv':
        extract_predictions_from_json(args.exp_folder)