"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "2f43cff2b72e9a5bee26d31e4e8af3087a5618e1"
Unverified Commit 637628a7 authored by Songyang Zhang's avatar Songyang Zhang Committed by GitHub
Browse files

[Doc] Update Doc for Alignbench (#707)

* update alignmentbench

* update alignmentbench

* update doc

* update

* update
from opencompass.models import OpenAI

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models = [
    dict(abbr='GPT-3.5-turbo-0613',
         type=OpenAI, path='gpt-3.5-turbo-0613',
         key='ENV',  # The key will be read from $OPENAI_API_KEY, but you can also write your key here directly
         meta_template=api_meta_template,
         query_per_second=1,
         max_out_len=2048, max_seq_len=4096, batch_size=8),
]
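Since the model config above uses `key='ENV'`, the OpenAI key is read from the environment. As a minimal sanity-check sketch (not part of the original config), you can verify the variable is set before launching a run:

```python
import os

# key='ENV' in the config above means the OpenAI key is taken from the
# environment, so make sure it is exported before starting the evaluation.
assert os.environ.get('OPENAI_API_KEY'), 'Please export OPENAI_API_KEY first.'
```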
Subjective evaluation aims to assess the model's performance in tasks that align with human preferences.
To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
Popular evaluation methods include:

- Compare Mode: compare model responses pairwise and compute the win rate.
- Score Mode: assign a score to each single model response (e.g., [Chatbot Arena](https://chat.lmsys.org/)).

We support using GPT-4 (or another JudgeLLM) for the subjective evaluation of models based on the above methods.
## Subjective Evaluation with Custom Dataset
The specific process includes:

1. Data preparation
2. Model response generation
3. Evaluation of the responses with a JudgeLLM
4. Parsing the JudgeLLM's output and computing the metrics
### Step-1: Data Preparation
We provide a mini test set for **Compare Mode** and **Score Mode**, as shown below:
```python
###COREV2
...
```

The JSON must include the required fields shown in the demo set above; you can also add extra fields. If you want to customize the prompt for each individual question, you can put the additional information into the 'others' field and construct the prompt from it.
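As a hedged illustration, a single entry of such a custom dataset might look like the dict below; only `question` and `others` come from the text above, while `capability` and the keys inside `others` are assumptions for demonstration.

```python
# Hypothetical single entry of a custom subjective dataset.
example_entry = {
    'question': 'If I throw a ball straight up, which direction does it travel first?',
    'capability': 'reasoning',  # assumed field name
    'others': {
        # assumed keys used to build a per-question prompt
        'evaluating_guidance': 'Judge whether the answer states the ball first moves upward.',
        'reference_answer': 'Upward.',
    },
}
```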
### Step-2: Evaluation Configuration (Compare Mode)
The specific process includes:

1. Model response inference
2. JudgeLLM pairwise evaluation
3. Evaluation report generation
For `configs/eval_subjective_compare.py`, we provide some annotations to help users understand the configuration file.
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer

with read_base():
    # Pre-defined models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.openai.gpt_4 import models as gpt4_model
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Evaluation datasets
datasets = [*subjective_datasets]

# Models to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Inference configuration
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',  # m models vs. n models
        # In m2n mode you must specify base_models and compare_models;
        # pairs are generated between base_models and compare_models.
        base_models=[*hf_qwen_14b_chat],  # baseline model(s)
        compare_models=[*hf_qwen_7b_chat, *hf_chatglm3_6b],  # models to be evaluated (imported above)
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=gpt4_model  # judge model
        )),
)

# Work directory; with '--reuse', existing results in it are reused automatically
work_dir = './outputs/subjective/'

summarizer = dict(
    type=Corev2Summarizer,  # custom dataset summarizer
    match_method='smart',  # answer extraction method
)
```
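To make the `m2n` comments above concrete, here is a minimal sketch of the pairing behaviour (an assumption for illustration, not the actual `SubjectiveNaivePartitioner` implementation): every base model is matched with every compare model, skipping self-comparisons and duplicate pairs.

```python
def m2n_pairs(base_models, compare_models):
    """Sketch of m2n pairing: base x compare, no self-pairs, no duplicates."""
    pairs = []
    for base in base_models:
        for cand in compare_models:
            if base == cand or (base, cand) in pairs:
                continue
            pairs.append((base, cand))
    return pairs


print(m2n_pairs(['qwen-14b-chat'], ['qwen-7b-chat', 'chatglm3-6b']))
# [('qwen-14b-chat', 'qwen-7b-chat'), ('qwen-14b-chat', 'chatglm3-6b')]
```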
In addition, you can also change the order in which the two models' responses are presented; please refer to `configs/eval_subjective_compare.py`.
When `infer_order` is set to `random`, the responses are presented in random order;
when `infer_order` is set to `double`, the responses of the two models are judged in both orders.
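A small sketch of what the two `infer_order` settings mean for one question answered by models A and B (an illustration of the idea, not the OpenCompass internals):

```python
import random


def build_comparisons(answer_a, answer_b, infer_order):
    if infer_order == 'random':
        pair = [answer_a, answer_b]
        random.shuffle(pair)  # one judged pair, presentation order shuffled
        return [tuple(pair)]
    if infer_order == 'double':
        # two judged pairs, one for each presentation order
        return [(answer_a, answer_b), (answer_b, answer_a)]
    raise ValueError(f'unknown infer_order: {infer_order}')
```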
### Step-2: Evaluation Configuration (Score Mode)
The config `configs/eval_subjective_score.py` is largely the same as `configs/eval_subjective_compare.py`; you only need to change the eval mode to `singlescore`.
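As a sketch of that change (the runner and judge settings are assumed to stay exactly as in the Compare Mode config above):

```python
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner

# Sketch: switch the partitioner mode from 'm2n' to 'singlescore' so that each
# response is scored on its own; the rest of the eval config is assumed unchanged.
eval_partitioner = dict(
    type=SubjectiveNaivePartitioner,
    mode='singlescore',
)
```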
### Step-3: Launch the Evaluation
```shell
python run.py configs/eval_subjective_score.py -r
```
The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.
## Evaluation Report
The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
## Practice: AlignBench Evaluation
### Dataset
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration
Please edit the config `configs/eval_subjective_alignbench.py` according to your needs.
### Evaluation

Running the following command starts both answer inference and subjective scoring; if you only need inference, you can specify `-m infer`.

```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
### Submit to the Official Leaderboard (Optional)

If you want to submit your predictions to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.

- Make sure you have the following results:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
    ├── predictions # model responses
├── results
└── summary
```
- Convert the data
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- Get the `.csv` files in `submission/` for submission
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
    ├── submission # files for submission
└── summary
```
To explore the model's subjective capabilities, we employ a JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).

Popular evaluation methods include:

- Compare Mode: compare model responses pairwise to compute their win rate.
- Score Mode: assign a score to each single model response (e.g., [Chatbot Arena](https://chat.lmsys.org/)).

Based on the above methods, we support using a JudgeLLM for subjective evaluation (any model currently supported in the opencompass repository can be called directly as a JudgeLLM, and support for some dedicated JudgeLLMs is also planned).
## Subjective Evaluation with Custom Dataset
The specific process of subjective evaluation includes:

1. Evaluation dataset preparation
2. Response inference with API models or open-source models
3. Evaluation of the model outputs with the chosen JudgeLLM
4. Parsing the JudgeLLM's predictions and computing numerical metrics
### Step-1: Data Preparation

We provide a demo test set for each of Compare Mode and Score Mode, as shown below:
```python
### Compare Mode example
[
    {
        "question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
        ...
    },
...]

### Score Mode dataset example
[
    {
        "question": "请你扮演一个邮件管家,我让你给谁发送什么主题的邮件,你就帮我扩充好邮件正文,并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题,来斟酌用词,并使用合适的敬语。现在请给导师发送邮件,询问他是否可以下周三下午15:00进行科研同步会,大约200字。",
        ...
    },
...]
```
The three fields above are required; users may also add other fields. If the prompt for each individual question needs separate handling, you can put extra settings in the 'others' field and add the corresponding fields in the Dataset class.
### Step-2: Evaluation Configuration (Compare Mode)

The specific process includes:

1. Model response inference
2. JudgeLLM evaluation
3. Evaluation report generation
For pairwise comparison, please refer to `configs/eval_subjective_compare.py` for more detailed config settings; below we provide abridged annotations to help users understand the meaning of the configuration file.
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer

with read_base():
    # Pre-defined models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.openai.gpt_4 import models as gpt4_model
    from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets

# Evaluation datasets
datasets = [*subjective_datasets]

# Models to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Inference configuration
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

# Evaluation configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='m2n',  # m models vs. n models
        # In m2n mode you must specify base_models and compare_models;
        # pairs are generated between base_models and compare_models
        # (deduplicated, and no model is compared against itself).
        base_models=[*hf_qwen_14b_chat],  # baseline model(s)
        compare_models=[*hf_qwen_7b_chat, *hf_chatglm3_6b],  # models to be evaluated (imported above)
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=gpt4_model  # judge model
        )),
)

# Work directory; with '--reuse', existing results in it are reused automatically
work_dir = './outputs/subjective/'

summarizer = dict(
    type=Corev2Summarizer,  # custom dataset summarizer
    match_method='smart',  # answer extraction method
)
```
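The introduction notes that any model supported in the repository can serve as the JudgeLLM. As a hedged sketch (not taken from the original config), the judge passed to `SubjectiveEvalTask` could be one of the open-source configs imported via `read_base()` above instead of `gpt4_model`:

```python
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# Hypothetical variant: reuse an already imported open-source chat config
# (e.g. hf_qwen_14b_chat) as the judge, passed exactly where gpt4_model
# is passed in the eval config above.
task = dict(
    type=SubjectiveEvalTask,
    judge_cfg=hf_qwen_14b_chat,  # assumption: any supported model config can act as judge
)
```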
In addition, in the dataset config you can also choose the order in which the two responses are presented; please refer to `configs/eval_subjective_compare.py`.
When `infer_order` is set to `random`, the order of the two models' responses is shuffled randomly;
when `infer_order` is set to `double`, the two models' responses are judged in both orders.
### Step-2: Evaluation Configuration (Score Mode)
For single-response scoring, please refer to `configs/eval_subjective_score.py` for more detailed config settings. Most of this config is the same as the pairwise-comparison config; you only need to change the eval mode to `singlescore`.
### Step-3: Launch the Evaluation and Get the Results
```shell
python run.py configs/eval_subjective_score.py -r
```
The `-r` parameter allows reusing model inference and evaluation results.

## Evaluation Report
The JudgeLLM's responses will be saved to `output/.../results/timestamp/xxmodel/xxdataset/.json`, and the evaluation report will be output to `output/.../summary/timestamp/report.csv`.
## Practice: AlignBench Subjective Evaluation

### Dataset Preparation
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration

Please modify the config file `configs/eval_subjective_alignbench.py` as needed.

### Launch the Evaluation

Running the following command starts both answer inference and subjective scoring; if you only need inference, you can specify `-m infer`.
```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
### Submit to the Official Leaderboard (Optional)

After the evaluation, if you want to submit your results to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.

- Make sure inference has finished and you have the following files:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
    ├── predictions # model responses
├── results
└── summary
```
- Run the following command to get the results ready for submission:
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- The `.csv` files for submission can be found in the `submission/` folder:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
    ├── submission # files for submission
└── summary
```
@@ -384,6 +384,7 @@ class OpenAIAllesAPIN(OpenAI):
elif item['role'] == 'SYSTEM':
    msg['role'] = 'system'
messages.append(msg)
# The model may respond with both 'user' and 'system' roles
# when an agent is involved.
assert msg['role'] in ['user', 'system']
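For context, the role conversion this fragment belongs to can be summarized by the mapping below; the `HUMAN`/`BOT` entries follow the `api_meta_template` examples earlier and this is an illustration rather than the full implementation:

```python
# Meta-template roles on the left, OpenAI chat roles on the right.
role_map = {'HUMAN': 'user', 'BOT': 'assistant', 'SYSTEM': 'system'}
```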
@@ -2,29 +2,38 @@ import argparse
import csv
import json
import os
from glob import glob
from tqdm import tqdm
def extract_predictions_from_json(input_folder):
    sub_folder = os.path.join(input_folder, 'submission')
    pred_folder = os.path.join(input_folder, 'predictions')
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    for model_name in os.listdir(pred_folder):
        model_folder = os.path.join(pred_folder, model_name)
        json_paths = glob(os.path.join(model_folder, 'alignment_bench_*.json'))
        # sort prediction shards by index
        json_paths = sorted(
            json_paths, key=lambda x: int(x.split('.json')[0].split('_')[-1]))
        all_predictions = []
        for json_ in json_paths:
            json_data = json.load(open(json_))
            for _, value in json_data.items():
                prediction = value['prediction']
                all_predictions.append(prediction)

        # write one submission CSV per model
        output_path = os.path.join(sub_folder, model_name + '_submission.csv')
        with open(output_path, 'w', encoding='utf-8') as file:
            writer = csv.writer(file)
            for ans in tqdm(all_predictions):
                writer.writerow([str(ans)])
        print('Saved {} for submission'.format(output_path))
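For reference, a hedged sketch of the prediction shard layout that `extract_predictions_from_json` expects under `outputs/<timestamp>/predictions/<model_name>/`; only the `prediction` field is actually read, and the other keys are illustrative assumptions:

```python
# alignment_bench_0.json (hypothetical content)
example_shard = {
    "0": {"origin_prompt": "...", "prediction": "model answer for sample 0"},
    "1": {"origin_prompt": "...", "prediction": "model answer for sample 1"},
}
```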
def process_jsonl(file_path):
@@ -61,9 +70,7 @@ def parse_args():
    parser.add_argument('--json',
                        default='your prediction file path',
                        help='The results json path')
    parser.add_argument('--exp-folder',
                        help='The experiment output folder')
    args = parser.parse_args()
    return args
@@ -75,4 +82,4 @@ if __name__ == '__main__':
        processed_data = process_jsonl(args.jsonl)
        save_as_json(processed_data)
    elif mode == 'csv':
        extract_predictions_from_json(args.exp_folder)