"examples/git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "d5c5bc219e22e0878a14208bc963b84d969e61f4"
Unverified commit 34966378 authored by Yuanchen, committed by GitHub

[evaluation] add automatic evaluation pipeline (#3821)



* add functions for gpt evaluation

* add automatic eval

Update eval.py

* using jload and modify the type of answers1 and answers2

* Update eval.py

Update eval.py

* Update evaluator.py

* support gpt evaluation

* update readme.md

update README.md

update README.md

modify readme.md

* add Chinese example for config, battle prompt and evaluation prompt file

* remove GPT-4 config

* remove sample folder

---------
Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
parent 05b8a8de
{
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT-3.5": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT-3.5": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT-3.5": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
import argparse
import json
import os
import openai
from evaluator import Evaluator
from utils import jload
def main(args):
assert len(args.answer_file_list) == len(
args.model_name_list), "The number of answer files and model names should be equal!"
# load config
config = jload(args.config_file)
if config["language"] == "cn":
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
metrics_all = {}
for metric_type, metrics in config["category"][category].items():
metrics_all[metric_type] = metrics
metrics_per_category[category] = metrics_all
battle_prompt = None
if args.battle_prompt_file:
battle_prompt = jload(args.battle_prompt_file)
gpt_evaluation_prompt = None
if args.gpt_evaluation_prompt_file:
gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
if len(args.model_name_list) == 2 and not battle_prompt:
raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
raise Exception(
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
evaluator.battle(answers1=answers1, answers2=answers2)
evaluator.save(args.save_path, args.model_name_list)
elif len(args.model_name_list) == 1:
targets = jload(args.target_file)
answers = jload(args.answer_file_list[0])
assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"
evaluator.evaluate(answers=answers, targets=targets)
evaluator.save(args.save_path, args.model_name_list)
else:
raise ValueError("Unsupported number of answer files and model names!")
else:
raise ValueError(f'Unsupported language {config["language"]}!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
parser.add_argument('--config_file',
type=str,
default=None,
required=True,
                        help='path to the config file')
parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
parser.add_argument('--gpt_evaluation_prompt_file',
type=str,
default=None,
help='path to the prompt file for gpt evaluation')
parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
parser.add_argument('--answer_file_list',
type=str,
nargs='+',
default=[],
required=True,
help='path to the answer files of at most 2 models')
parser.add_argument('--model_name_list',
type=str,
nargs='+',
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
main(args)
python eval.py \
--config_file "path to the config file" \
--battle_prompt_file "path to the prompt file for battle" \
--gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
--target_file "path to the target answer file" \
--answer_file_list "path to the answer files of at most 2 models" \
--model_name_list "the names of at most 2 models" \
--save_path "path to save results" \
--openai_key "your openai key" \
# Adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py
# Copyright 2023 LM-SYS@FastChat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import time
import re
import concurrent.futures
import openai
import tqdm
import shortuuid
import logging
from utils import jload, jdump, get_json_list
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
MAX_API_RETRY = 3
def get_eval(sys_prompt, user_prompt: str, answer_id: int, max_tokens: int, model: str):
logging.basicConfig(level=logging.INFO)
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[{
'role': 'system',
'content': sys_prompt
}, {
'role': 'user',
'content': user_prompt,
}],
temperature=0.2,
max_tokens=max_tokens,
)
review = response['choices'][0]['message']['content']
return {"review": review, 'id': answer_id}
except Exception as e:
logger.error(e)
time.sleep(1)
logger.error(f' Review {answer_id} failed after {MAX_API_RETRY} retries.')
    return {'review': '', 'id': answer_id}
def parse_score(review):
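    # Heuristic parser for the score pair in a GPT-4 review: it tries, in order,
    # the formats "8 out of 10 ... 7 out of 10", "a score of 8 ... a score of 7",
    # "8/10 ... 7/10", and finally a first line such as "8 7" or "8, 7".
    # Exactly two matches yield [score_1, score_2]; anything else falls through to
    # [-1, -1], which marks the review as invalid downstream.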
try:
pattern = re.compile('([0-9]|10) out of 10')
sp = re.findall(pattern, review)
if len(re.findall(pattern, review)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile('a score of ([0-9]|10)')
sp = re.findall(pattern, review)
if len(re.findall(pattern, review)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile('([0-9]|10)/10')
sp = re.findall(pattern, review)
if len(re.findall(pattern, review)) == 2:
return [float(sp[0]), float(sp[1])]
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
raise Exception('Invalid score pair.')
except Exception as e:
return [-1, -1]
def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2):
reviewer_idx = 0
for idx, reviewer in enumerate(reviewer_jsons):
if reviewer['category'] == cat:
reviewer_idx = idx
break
prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
prompt_json = prompt_jsons[prompt_id-1]
assert prompt_json['prompt_id'] == prompt_id
sys_prompt = prompt_json['system_prompt']
prompt_template = prompt_json['prompt_template']
defaults = prompt_json['defaults']
prompt = prompt_template.format(
question=ques, answer_1=ans1, answer_2=ans2, **defaults)
return sys_prompt, prompt, reviewer_idx+1
def evaluate(args):
answer1_jsons = jload(args.answer_file_list[0])
answer2_jsons = jload(args.answer_file_list[1])
reviewer_jsons = get_json_list(args.reviewer_file)
prompt_jsons = get_json_list(args.prompt_file)
assert len(answer1_jsons) == len(answer2_jsons)
handles = []
review_jsons = []
total_len = len(answer1_jsons)
question_idx_list = list(range(total_len))
logger.info(
f' Total number of answers: {len(answer2_jsons)}.')
reviews = []
with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
futures = []
for i in question_idx_list:
assert answer1_jsons[i]['id'] == answer2_jsons[i]['id']
answer_id = answer1_jsons[i]['id']
ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \
" " + answer1_jsons[i]['input']
cat = answer1_jsons[i]['category']
ans1 = answer1_jsons[i]['output']
ans2 = answer2_jsons[i]['output']
sys_prompt, prompt, reviewer_id = gen_prompt(
reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2)
review_id = shortuuid.uuid()
review_jsons.append({
'review_id': review_id,
'id': answer_id,
'reviewer_id': reviewer_id,
'metadata': {}
})
future = executor.submit(
get_eval, sys_prompt, prompt, answer_id, args.max_tokens, args.model)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
reviews.append(future.result())
reviews.sort(key=lambda x: x['id'])
review_jsons.sort(key=lambda x: x['id'])
ans1_score = 0
ans2_score = 0
better_count = 0
worse_count = 0
tie_count = 0
invalid_count = 0
better_file = []
worse_file = []
tie_file = []
invalid_file = []
output_review_file = []
for idx, review in enumerate(reviews):
scores = parse_score(review['review'])
review_jsons[idx]['review'] = review['review']
review_jsons[idx]['score'] = scores
if scores[0] == -1 and scores[1] == -1:
invalid_count += 1
invalid_file.append(review_jsons[idx])
logger.info(f' Invalid score pair: {review_jsons[idx]["id"]}.')
else:
if scores[0] > scores[1]:
worse_count += 1
worse_file.append(review_jsons[idx])
elif scores[0] < scores[1]:
better_count += 1
better_file.append(review_jsons[idx])
else:
tie_count += 1
tie_file.append(review_jsons[idx])
ans1_score += scores[0]
ans2_score += scores[1]
output_review_file.append(review_jsons[idx])
better_file.sort(key=lambda x: x['id'])
worse_file.sort(key=lambda x: x['id'])
tie_file.sort(key=lambda x: x['id'])
invalid_file.sort(key=lambda x: x['id'])
output_review_file.sort(key=lambda x: x['id'])
name1 = os.path.basename(args.answer_file_list[0]).split("_answers")[0]
name2 = os.path.basename(args.answer_file_list[1]).split("_answers")[0]
prefix = f"{name1}_vs_{name2}"
jdump(better_file, os.path.join(
args.output_folder, prefix, f"{prefix}_better.json"))
jdump(worse_file, os.path.join(
args.output_folder, prefix, f"{prefix}_worse.json"))
jdump(tie_file, os.path.join(
args.output_folder, prefix, f"{prefix}_tie.json"))
jdump(invalid_file, os.path.join(
args.output_folder, prefix, f"{prefix}_invalid.json"))
jdump(output_review_file, os.path.join(
args.output_folder, prefix, f"{prefix}_review.json"))
if os.path.exists(os.path.join(args.output_folder, "results.json")):
results = jload(os.path.join(args.output_folder, "results.json"))
else:
results = {}
    results[prefix] = {
        'model': [name1, name2],
        'better': better_count,
        'worse': worse_count,
        'tie': tie_count,
        'win_rate': better_count / (len(reviews) - invalid_count),
        'score': [ans1_score / (len(reviews) - invalid_count),
                  ans2_score / (len(reviews) - invalid_count)]
    }
jdump(results, os.path.join(args.output_folder, "results.json"))
logger.info(f' Total {invalid_count} invalid score pair(s).')
logger.info(f' Model {name2} has {better_count} better answer(s).')
logger.info(f' Model {name2} has {worse_count} worse answer(s).')
logger.info(f' {tie_count} answer(s) play(s) to a tie.')
logger.info(
f' Win rate of model {name2}: {better_count/(len(reviews)-invalid_count):.2f}')
logger.info(
f' Model {name1} average score: {ans1_score/(len(reviews)-invalid_count):.2f}')
logger.info(
f' Model {name2} average score: {ans2_score/(len(reviews)-invalid_count):.2f}')
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Model evaluation.')
parser.add_argument('--answer_file_list', nargs='+', default=[])
parser.add_argument('--prompt_file')
parser.add_argument('--reviewer_file')
parser.add_argument('--output_folder', type=str, default="./output")
parser.add_argument('--openai_key', type=str, default=None)
parser.add_argument('--model', type=str, default="gpt-4")
parser.add_argument('--num_workers', type=int, default=8)
parser.add_argument('--max_tokens', type=int, default=512,
help='maximum number of tokens produced in the output')
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
evaluate(args)
python evaluate.py \
--answer_file_list "path to answers of model 1" "path to answers of model 2" \
--prompt_file "path to prompt file" \
--reviewer_file "path to reviewer file" \
--output_folder "path to output folder" \
--openai_key "your openai key" \
--model "gpt-4" \
--num_workers 8 \
    --max_tokens 512
import os
from typing import Any, Dict, List
import gpt_evaluate
import metrics
import pandas as pd
from utils import get_data_per_category, jdump
class Evaluator(object):
"""
    Evaluator wraps GPT-3.5/GPT-4 evaluation
    and automatic metric-based evaluation.
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
Any]) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
"""
Comparison between two models using GPT-4 as the reviewer.
"""
self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt)
def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None:
"""
A comprehensive evaluation of the answers from the model.
The function evaluates the model's performance from different perspectives
using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
The metrics will be decided by the config file.
"""
def switch(metric):
if metric == "BLEU":
return metrics.bleu_score(preds=predicts_list, targets=targets_list)
elif metric == "ROUGE":
return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
elif (metric == "Distinct"):
return metrics.distinct_score(preds=predicts_list)
elif (metric == "BERTScore"):
return metrics.bert_score(preds=predicts_list, targets=targets_list)
elif (metric == "Precision"):
return metrics.precision(preds=predicts_list, targets=targets_list)
elif (metric == "Recall"):
return metrics.recall(preds=predicts_list, targets=targets_list)
elif (metric == "F1 score"):
return metrics.F1_score(preds=predicts_list, targets=targets_list)
else:
raise ValueError(f"Unexpected metric")
answers_per_category = get_data_per_category(answers, list(self.params.keys()))
targets_per_category = get_data_per_category(targets, list(self.params.keys()))
# automatic evaluation
for category in self.params:
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
targets_list = [
target["target"] if target["target"] else target["output"] for target in targets_per_category[category]
]
predicts_list = [answer["output"] for answer in answers_per_category[category]]
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))
# gpt35 evaluation
for category in self.params:
category_metrics = self.params[category]["GPT-3.5"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
prompt, category_metrics, category)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
"""
if len(model_name_list) == 2:
save_path = os.path.join(path, "gpt_evaluate", "battle_results")
gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
else:
# save evaluation results for automatic metrics
automatic_df = pd.DataFrame(self.automatic_metric_stats)
automatic_results_save_path = os.path.join(path, "automatic_results")
if not os.path.exists(automatic_results_save_path):
os.makedirs(automatic_results_save_path)
automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations,
os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
            # Calculate scores and save statistics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
import argparse
import os
import random
import copy
import math
from tqdm import tqdm
import torch
import torch.distributed as dist
import transformers
from coati.models.bloom import BLOOMActor
from coati.models.gpt import GPTActor
from coati.models.opt import OPTActor
from coati.models.roberta import RoBERTaActor
from coati.models.llama import LlamaActor
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from transformers import AutoTokenizer, RobertaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.logging import get_dist_logger
from utils import jload, jdump, is_rank_0
logger = get_dist_logger()
PROMPT_DICT = {
"prompt_input":
("Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"),
"prompt_no_input": ("Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response:"),
}
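# For illustration, PROMPT_DICT["prompt_no_input"].format(instruction="List three primary colors.")
# renders to:
#   "Below is an instruction that describes a task. Write a response that appropriately
#    completes the request.\n\n### Instruction:\nList three primary colors.\n\n### Response:"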
def generate(args):
# torch.cuda.set_per_process_memory_fraction(0.4)
if args.strategy == 'naive':
strategy = NaiveStrategy()
elif args.strategy == 'ddp':
strategy = DDPStrategy()
elif args.strategy == 'colossalai_gemini':
strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
elif args.strategy == 'colossalai_zero2':
strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
elif args.strategy == 'colossalai_zero2_cpu':
strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
world_size = dist.get_world_size()
rank = dist.get_rank()
with strategy.model_init_context():
if args.model == 'gpt2':
actor = GPTActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'bloom':
actor = BLOOMActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'opt':
actor = OPTActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'roberta':
actor = RoBERTaActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'llama':
actor = LlamaActor(pretrained=args.model_path).to(
torch.float16).to(torch.cuda.current_device())
else:
raise ValueError(f'Unsupported model "{args.model}"')
if args.model == 'gpt2':
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
elif args.model == 'roberta':
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
elif args.model == 'llama':
tokenizer = AutoTokenizer.from_pretrained(args.model_path,
padding_side="right",
use_fast=False,
)
        tokenizer.eos_token = '</s>'
else:
raise ValueError(f'Unsupported model "{args.model}"')
questions = []
if args.max_datasets_size is not None:
questions = random.sample(jload(args.dataset), args.max_datasets_size)
if is_rank_0():
logger.info(
f"Limiting dataset to {args.max_datasets_size} examples.")
questions = questions[rank:args.max_datasets_size:world_size]
answers = copy.deepcopy(questions)
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
sources = [
prompt_input.format_map(example) if example.get(
"input", "") != "" else prompt_no_input.format_map(example)
for example in questions
]
if is_rank_0():
logger.info("Tokenizing inputs... This may take some time...")
input_ids_list = []
for string in sources:
input_ids = tokenizer.encode(string, return_tensors='pt').squeeze(0)
input_ids_list.append(input_ids)
bar = tqdm(range(math.ceil(len(input_ids_list)/args.batch_size)),
desc=f'steps', disable=not is_rank_0())
actor.eval()
with torch.no_grad():
for i in range(0, len(input_ids_list), args.batch_size):
batch = input_ids_list[i:i+args.batch_size]
batch = [i.flip(dims=[0]) for i in batch]
batch = torch.nn.utils.rnn.pad_sequence(batch,
batch_first=True,
padding_value=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0).to(torch.cuda.current_device())
batch = batch.flip(dims=[1])
attention_mask = batch.ne(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0)
outputs = actor.model.generate(batch, attention_mask=attention_mask,
max_length=args.max_length,
do_sample=True,
top_k=50,
top_p=0.95,
num_return_sequences=1)
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for j in range(batch.size(0)):
                answers[i + j]['output'] = outputs[j].split("### Response:")[1].strip()
bar.update()
jdump(answers, os.path.join(args.answer_path,
f'{args.model_name}_answers_rank{rank}.json'))
if is_rank_0():
logger.info(
f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--strategy',
choices=['naive', 'ddp', 'colossalai_gemini',
'colossalai_zero2', 'colossalai_zero2_cpu'],
default='naive')
parser.add_argument('--model', default='gpt2',
choices=['gpt2', 'bloom', 'opt', 'roberta', 'llama'])
parser.add_argument('--model_path', type=str, default=None)
parser.add_argument('--model_name', type=str, default='model')
parser.add_argument('--dataset', type=str, default=None)
parser.add_argument('--batch_size', type=int, default=1)
parser.add_argument('--max_datasets_size', type=int, default=None)
parser.add_argument('--answer_path', type=str, default="answer")
parser.add_argument('--max_length', type=int, default=1024)
args = parser.parse_args()
generate(args)
device_number=number of your devices
model_name="name of your model"
model_path="path to your model"
dataset="path to the question dataset"
answer_path="path to save the model answers"
torchrun --standalone --nproc_per_node=$device_number generate_answers.py \
--model 'llama' \
--strategy ddp \
--model_path $model_path \
--model_name $model_name \
--dataset $dataset \
--batch_size 8 \
--max_datasets_size 80 \
--answer_path $answer_path \
--max_length 512
python merge.py \
--model_name $model_name \
--shards $device_number \
    --answer_path $answer_path
for (( i=0; i<device_number; i++ )) do
rm -rf "${answer_path}/${model_name}_answers_rank${i}.json"
done
# Adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/qa_baseline_gpt35.py
# Copyright 2023 LM-SYS@FastChat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import time
import concurrent.futures
import openai
import tqdm
import shortuuid
import logging
from utils import jload, jdump
MODEL = 'gpt-3.5-turbo'
MAX_API_RETRY = 3
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_answer(question: str, max_tokens: int):
answer = question
prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \
" " + question['input']
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[{
'role': 'system',
'content': 'You are a helpful assistant.'
}, {
'role': 'user',
'content': prompt,
}],
max_tokens=max_tokens,
)
answer['output'] = response['choices'][0]['message']['content']
return answer
except Exception as e:
logger.error(e)
time.sleep(1)
logger.error(f' Answer {question["id"]} failed after {MAX_API_RETRY} retries.')
return answer
def evaluate_gpt35(args):
questions=jload(args.dataset)
logger.info(
f' Total number of answers: {len(questions)}.')
logger.info(
f' Waiting for {args.request_time_gap} seconds before sending the next request.')
answers = []
with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
futures = []
for question in questions:
future = executor.submit(get_answer, question, args.max_tokens)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
answers.append(future.result())
answers.sort(key=lambda x: x['id'])
jdump(answers, os.path.join(args.answer_path,
f'gpt35_answers.json'))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Evaluate GPT 3.5.')
parser.add_argument('--dataset', type=str, default="questions.json")
parser.add_argument('--answer_path', type=str, default="answer")
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--openai_key', type=str, default=None)
parser.add_argument('--max_tokens', type=int, default=1024)
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
evaluate_gpt35(args)
python generate_gpt35_answers.py \
--dataset "path to the question dataset" \
--answer_path "path to answer folder" \
--num_workers 4 \
--openai_key "your openai key" \
    --max_tokens 512
import concurrent.futures
import os
import re
import time
from copy import deepcopy
from typing import Any, Dict, List
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import tqdm
from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
user_prompt: prompt for the user.
id: id of the answers for comparison.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one comparison.
"""
MAX_API_RETRY = 3
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{
"role": "system",
"content": sys_prompt
},
{
"role": "user",
"content": user_prompt,
},
],
temperature=0.2,
max_tokens=max_tokens,
)
evaluation = response["choices"][0]["message"]["content"]
return {"evaluation": evaluation, "id": id}
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
def parse_battle_score(evaluation: str) -> List[float]:
"""
Parse evaluation from GPT-4 and get the scores of model 1 and 2.
Args:
evaluation: evaluation from GPT-4.
Returns:
A score pair of two different model answers.
"""
try:
pattern = re.compile("([0-9]|10) out of 10")
sp = re.findall(pattern, evaluation)
if len(re.findall(pattern, evaluation)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile("a score of ([0-9]|10)")
sp = re.findall(pattern, evaluation)
if len(re.findall(pattern, evaluation)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile("([0-9]|10)/10")
sp = re.findall(pattern, evaluation)
if len(re.findall(pattern, evaluation)) == 2:
return [float(sp[0]), float(sp[1])]
score_pair = evaluation.split("\n")[0]
score_pair = score_pair.replace(",", " ")
sp = score_pair.split(" ")
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return [-1, -1]
def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]:
"""
Use GPT-4 to compare answers of two different models.
Args:
answer1: answers of model 1.
answer2: answers of model 2.
prompt_dict: prompt for battle.
Returns:
Evaluations of all comparison pairs.
"""
assert len(answer1) == len(answer2)
handles = []
evaluation_file = []
total_len = len(answer1)
question_idx_list = list(range(total_len))
print(f" Total number of answers: {len(answer1)}.")
evaluations = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for i in question_idx_list:
assert answer1[i]["id"] == answer2[i]["id"]
answer_id = answer1[i]["id"]
ques = answer1[i]["instruction"] if answer1[i][
"input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"]
cat = answer1[i]["category"]
ans1 = answer1[i]["output"]
ans2 = answer2[i]["output"]
sys_prompt = prompt_dict["system_prompt"]
prompt_template = prompt_dict["prompt_template"]
prompt = prompt_template.format(
question=ques,
answer_1=ans1,
answer_2=ans2,
prompt=prompt_dict["prompt"],
)
future = executor.submit(get_battle_result, sys_prompt, prompt, answer_id, 2048)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
evaluations.append(future.result())
evaluations.sort(key=lambda x: x["id"])
return evaluations
def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
"""
Save evaluation results (model 1 vs model 2) from GPT-4.
Args:
evaluations: evaluation results from GPT-4.
name1: model 1 's name.
name2: model 2 's name.
save_path: path to save battle results.
"""
evaluation_file = deepcopy(evaluations)
ans1_score = 0
ans2_score = 0
better_count = 0
worse_count = 0
tie_count = 0
invalid_count = 0
better_file = []
worse_file = []
tie_file = []
invalid_file = []
for idx, evaluation in enumerate(evaluations):
scores = parse_battle_score(evaluation["evaluation"])
evaluation_file[idx]["score"] = scores
if scores[0] == -1 and scores[1] == -1:
invalid_count += 1
invalid_file.append(evaluation_file[idx])
print(f'Invalid score pair: {evaluation_file[idx]["id"]}.')
else:
if scores[0] > scores[1]:
worse_count += 1
worse_file.append(evaluation_file[idx])
elif scores[0] < scores[1]:
better_count += 1
better_file.append(evaluation_file[idx])
else:
tie_count += 1
tie_file.append(evaluation_file[idx])
ans1_score += scores[0]
ans2_score += scores[1]
prefix = f"{name1}_vs_{name2}"
if not os.path.exists(save_path):
os.makedirs(save_path)
jdump(better_file, os.path.join(save_path, prefix, f"{name2}_better.json"))
jdump(worse_file, os.path.join(save_path, prefix, f"{name2}_worse.json"))
jdump(tie_file, os.path.join(save_path, prefix, f"{prefix}_tie.json"))
jdump(invalid_file, os.path.join(save_path, prefix, f"{prefix}_invalid.json"))
jdump(evaluation_file, os.path.join(save_path, prefix, f"{prefix}_evaluations.json"))
if os.path.exists(os.path.join(save_path, "battle_results.json")):
results = jload(os.path.join(save_path, "battle_results.json"))
else:
results = {}
results[prefix] = {
"model": [name1, name2],
"better": better_count,
"worse": worse_count,
"tie": tie_count,
"win_rate": better_count / (len(evaluations) - invalid_count),
"score": [
ans1_score / (len(evaluations) - invalid_count),
ans2_score / (len(evaluations) - invalid_count),
],
}
jdump(results, os.path.join(save_path, "battle_results.json"))
print(f"Total {invalid_count} invalid score pair(s).")
print(f"Model {name2} has {better_count} better answer(s).")
print(f"Model {name2} has {worse_count} worse answer(s).")
print(f"{tie_count} answer(s) play(s) to a tie.")
print(f"Win rate of model {name2}: {better_count/(len(evaluations)-invalid_count):.2f}")
print(f"Model {name1} average score: {ans1_score/(len(evaluations)-invalid_count):.2f}")
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
max_tokens: the maximum number of tokens to generate in the completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.Completion.create(
model="text-davinci-003",
prompt=prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
logprobs=5,
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["text"],
"logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
}
break
except Exception as e:
print(e)
time.sleep(1)
return inst
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
category: the category of the model answers for evaluation.
Returns:
Evaluations of the given answers.
"""
print(f"The number of instances of category {category}'s is {len(answers)}.")
evaluations = []
metrics_str = ", ".join(x for x in metrics)
print(f"Category {category}'s metrics are {metrics_str}.")
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
futures.append(future)
for future in tqdm.tqdm(
concurrent.futures.as_completed(futures),
desc=f"{category}: ",
total=len(futures),
):
evaluations.append(future.result())
evaluations.sort(key=lambda x: x["id"])
print(f"{category} done.")
return evaluations
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculation formula:
score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
prob = np.zeros(5)
for key, value in logprobs.items():
# Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7".
# It is meaningless and thus we don't calculate probability.
if "bytes" in key:
continue
# results[0] is the score which corresponds to the key(predicted token).
# For example, key "5" corresponds to score 5.
results = re.findall(r"\d", key)
if len(results) == 1:
prob[int(results[0]) - 1] = prob[int(results[0]) - 1] + np.exp(value)
score = np.dot(np.arange(1, 6), prob)
return score
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
Args:
model_name: name of the model for saving statistics.
evaluations: evaluations for all of the model answers.
save_path: path to save GPT-3.5 evaluation statistics.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
data_per_category = {}
for evaluation in evaluations:
category = evaluation["category"]
if evaluation["category"] in data_per_category.keys():
data_per_category[category].append(evaluation)
else:
data_per_category[category] = [evaluation]
all_statistics = {}
for category, data in data_per_category.items():
metrics = data[0]["evaluation"].keys()
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
statistics = {}
for metric in metrics:
arg_sort = np.argsort(scores[metric])
statistics[metric] = {}
statistics[metric]["avg_score"] = sum(scores[metric]) / len(data)
statistics[metric]["best_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[-3:][::-1]}
statistics[metric]["worst_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[:3]}
all_statistics[category] = statistics
jdump(
all_statistics,
os.path.join(save_path, f"{model_name}_evaluation_statistics.json"),
)
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
Args:
statistics_path: path to all the models' statistics.
save_path: path to save table and visualization results.
"""
if not os.path.exists(statistics_path):
raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! No statistics found!')
all_statistics = {}
for file_name in os.listdir(statistics_path):
if file_name.endswith("_evaluation_statistics.json"):
model_name = file_name.split("_evaluation_statistics.json")[0]
all_statistics[model_name] = jload(os.path.join(statistics_path, file_name))
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no statistics in the given directory "{statistics_path}"!')
frame_all = {
"model": [],
"category": [],
"metric": [],
"avg_score": [],
"best_3": [],
"worst_3": [],
}
frame_per_category = {}
for model_name, model_statistics in all_statistics.items():
for category, category_statistics in model_statistics.items():
if frame_per_category.get(category) is None:
frame_per_category[category] = {
"model": [],
"metric": [],
"avg_score": [],
"best_3": [],
"worst_3": [],
}
for metric, metric_statistics in category_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["avg_score"].append(metric_statistics["avg_score"])
frame_all["best_3"].append(metric_statistics["best_3"])
frame_all["worst_3"].append(metric_statistics["worst_3"])
frame_per_category[category]["model"].append(model_name)
frame_per_category[category]["metric"].append(metric)
frame_per_category[category]["avg_score"].append(metric_statistics["avg_score"])
frame_per_category[category]["best_3"].append(metric_statistics["best_3"])
frame_per_category[category]["worst_3"].append(metric_statistics["worst_3"])
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
desc=f"category: ",
total=len(frame_per_category.keys()),
):
data = pd.DataFrame(frame_per_category[category])
sns.set()
fig = plt.figure(figsize=(16, 10))
plt.ylim((0, 5))
fig = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True)
fig.set_title(f"Comparison between Different Models for Category {category.title()}")
plt.xlabel("Evaluation Metric")
plt.ylabel("Average Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
import argparse
import os
from utils import jload, jdump
def generate(args):
dataset = []
for i in range(args.shards):
shard = jload(os.path.join(args.answer_path,
f'{args.model_name}_answers_rank{i}.json'))
dataset.extend(shard)
dataset.sort(key=lambda x: x['id'])
jdump(dataset, os.path.join(args.answer_path,
f'{args.model_name}_answers.json'))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='model')
parser.add_argument('--shards', type=int, default=4)
parser.add_argument('--answer_path', type=str, default="answer")
args = parser.parse_args()
generate(args)
import statistics
import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_chinese import Rouge as Rouge_cn
from sklearn.metrics import f1_score, precision_score, recall_score
def bleu_score(preds: list, targets: list) -> dict:
"""Calculate BLEU Score Metric
The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
    BLEU-3 for trigram and BLEU-4 for 4-gram. Unigram evaluates
    accuracy at the word level, while the other n-grams evaluate
    fluency at the sentence level.
"""
bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
cumulative_bleu = [0] * 4
weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.),
(1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
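    # Passing a list of weight tuples to sentence_bleu (supported in recent NLTK releases)
    # returns one score per tuple, i.e. BLEU-1..BLEU-4 for each prediction, which are
    # accumulated and then averaged over all prediction/target pairs below.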
for pred, target in zip(preds, targets):
pred_list = (' '.join(jieba.cut(pred))).split()
target_list = [(' '.join(jieba.cut(target))).split()]
bleu = sentence_bleu(target_list, pred_list, weights=weights)
cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
for i in range(len(cumulative_bleu)):
bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds)
return bleu_scores
def rouge_cn_score(preds: list, targets: list) -> dict:
"""Calculate Chinese ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
    the preds and targets. ROUGE-L measures the longest common
    subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}}
all_preds = []
all_targets = []
for pred, target in zip(preds, targets):
pred_list = ' '.join(jieba.cut(pred))
target_list = ' '.join(jieba.cut(target))
all_preds.append(pred_list)
all_targets.append(target_list)
rouge_cn = Rouge_cn()
rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True)
rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"]
rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"]
rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"]
return rouge_scores
def distinct_score(preds: list) -> dict:
"""Calculate Distinct Score Metric
This metric refers to https://arxiv.org/abs/1510.03055.
    It evaluates the diversity of the generated text by counting
    the unique n-grams.
"""
distinct_score = {"distinct": 0}
cumulative_distinct = []
for pred in preds:
pred_seg_list = list(' '.join(jieba.cut(pred)))
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
cumulative_distinct.append(count_unique_chars / count_segs)
distinct_score["distinct"] = statistics.mean(cumulative_distinct)
return distinct_score
def bert_score(preds: list, targets: list) -> dict:
"""Calculate BERTScore Metric
The BERTScore evaluates the semantic similarity between
tokens of preds and targets with BERT.
"""
bert_score = {"bert_score": 0}
pred_list = []
target_list = []
for pred, target in zip(preds, targets):
pred_list.append(' '.join(jieba.cut(pred)))
target_list.append(' '.join(jieba.cut(target)))
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
bert_score["bert_score"] = F.mean().item()
return bert_score
def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
"""Precision, Recall and F1-Score Calculation
    The calculation of precision, recall and f1-score is realized by counting
    the number of overlaps between the preds and targets. The comparison length is
    limited by the shorter one of preds and targets. This design is mainly
    intended for the classification and extraction categories.
"""
precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
precision_scores = []
recall_scores = []
f1_scores = []
for pred, target in zip(preds, targets):
pred_list = [char for char in pred]
target_list = [char for char in target]
target_labels = [1] * min(len(target_list), len(pred_list))
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0))
recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0))
f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0))
precision_recall_f1["precision"] = statistics.mean(precision_scores)
precision_recall_f1["recall"] = statistics.mean(recall_scores)
precision_recall_f1["f1_score"] = statistics.mean(f1_scores)
return precision_recall_f1
def precision(preds: list, targets: list) -> dict:
"""Calculate Precision Metric
    (designed for the classification and extraction categories)
    Calculating precision by counting the number of overlaps between the preds and targets.
"""
precision = {"precision": 0}
precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"]
return precision
def recall(preds: list, targets: list) -> dict:
"""Calculate Recall Metric
    (designed for the classification and extraction categories)
    Calculating recall by counting the number of overlaps between the preds and targets.
"""
recall = {"recall": 0}
recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"]
return recall
def F1_score(preds: list, targets: list) -> dict:
"""Calculate F1-score Metric
    (designed for the classification and extraction categories)
    Calculating f1-score by counting the number of overlaps between the preds and targets.
"""
f1 = {"f1_score": 0}
f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"]
return f1
{
"id": 1,
"system_prompt": "你是一个检查回答质量的好助手。",
"prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
"prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。"
}
jieba
bert-score
rouge_chinese
scikit-learn
nltk
openai
seaborn
pandas
matplotlib
numpy
[
{
"id": 0,
"instruction": "Help me summarize the following news?",
"input": "National Commercial Bank (NCB), Saudi Arabia's largest lender by assets, agreed to buy rival Samba Financial Group for $15 billion in the biggest banking takeover this year.NCB will pay 28.45 riyals ($7.58) for each Samba share, according to a statement on Sunday, valuing it at about 55.7 billion riyals. NCB will offer 0.739 new shares for each Samba share, at the lower end of the 0.736-0.787 ratio the banks set when they signed an initial framework agreement in June.The offer is a 3.5% premium to Samba's Oct. 8 closing price of 27.50 riyals and about 24% higher than the level the shares traded at before the talks were made public. Bloomberg News first reported the merger discussions.The new bank will have total assets of more than $220 billion, creating the Gulf region's third-largest lender. The entity's $46 billion market capitalization nearly matches that of Qatar National Bank QPSC, which is still the Middle East's biggest lender with about $268 billion of assets.",
"output": "NCB to pay 28.45 riyals for each Samba share. Deal will create Gulf region's third-largest lender",
"category": "closed qa"
}
]
@@ -2,10 +2,6 @@ import io
 import json
 import os
-import torch.distributed as dist
-def is_rank_0() -> bool:
-    return not dist.is_initialized() or dist.get_rank() == 0
 def _make_w_io_base(f, mode: str):
     if not isinstance(f, io.IOBase):
@@ -15,11 +11,13 @@ def _make_w_io_base(f, mode: str):
         f = open(f, mode=mode)
     return f
 def _make_r_io_base(f, mode: str):
     if not isinstance(f, io.IOBase):
         f = open(f, mode=mode)
     return f
 def jdump(obj, f, mode="w", indent=4, default=str):
     """Dump a str or dictionary to a file in json format.
     Args:
@@ -38,6 +36,7 @@ def jdump(obj, f, mode="w", indent=4, default=str):
         raise ValueError(f"Unexpected type: {type(obj)}")
     f.close()
 def jload(f, mode="r"):
     """Load a .json file into a dictionary."""
     f = _make_r_io_base(f, mode)
@@ -45,9 +44,19 @@ def jload(f, mode="r"):
     f.close()
     return jdict
 def get_json_list(file_path):
     with open(file_path, 'r') as f:
         json_list = []
         for line in f:
             json_list.append(json.loads(line))
         return json_list
+def get_data_per_category(data, categories):
+    data_per_category = {category: [] for category in categories}
+    for item in data:
+        category = item["category"]
+        data_per_category[category].append(item)
+    return data_per_category