"vscode:/vscode.git/clone" did not exist on "00a9c781fd20231a4ae4f26fae768e714d8808ec"
Unverified commit 2506e275 authored by Yuanchen and committed by GitHub

[evaluation] improvement on evaluation (#3862)



* fix a bug where the config file contains a category that the answer file does not contain

* fix Chinese prompt file

* support gpt-3.5-turbo and gpt-4 evaluation

* polish and update README

* resolve pr comments

---------
Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
parent b0474878
......@@ -2,7 +2,7 @@
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"creativity",
......@@ -14,7 +14,7 @@
]
},
"chat": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"naturalness",
......@@ -26,7 +26,7 @@
]
},
"classification": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -38,7 +38,7 @@
]
},
"closed_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -50,7 +50,7 @@
]
},
"extraction": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -62,7 +62,7 @@
]
},
"generation": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"diversity"
......@@ -74,7 +74,7 @@
]
},
"open_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -84,7 +84,7 @@
]
},
"rewriting": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -96,7 +96,7 @@
]
},
"roleplay": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"fidelity",
......@@ -107,7 +107,7 @@
]
},
"summarization": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness",
......
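For reference, a minimal sketch of what one category entry in the evaluation config looks like after this rename. The structure is inferred from the fragments shown above and from evaluator.py, which reads params[category]["Metrics"] for automatic metrics and params[category]["GPT"] for GPT-scored metrics; the "Metrics" lists are collapsed in this diff, so their contents below are placeholders.

config = {
    "language": "cn",
    "category": {
        "brainstorming": {
            # metrics scored by a GPT model (key renamed from "GPT-3.5" to "GPT")
            "GPT": ["language organization", "relevance", "creativity"],
            # automatic metrics, accessed as params[category]["Metrics"] in evaluator.py
            "Metrics": ["<automatic metric names>"],
        },
        # ...the other categories (chat, classification, closed_qa, ...) follow the same shape
    },
}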
......@@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"])
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
......@@ -87,6 +88,10 @@ if __name__ == '__main__':
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--gpt_model',
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()
......
......@@ -14,13 +14,15 @@ class Evaluator(object):
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
Any]) -> None:
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
gpt_model: str, language: str) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.gpt_model = gpt_model
self.language = language
self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict()
self.gpt_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
......@@ -63,6 +65,10 @@ class Evaluator(object):
# automatic evaluation
for category in self.params:
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
......@@ -74,17 +80,21 @@ class Evaluator(object):
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))
# gpt35 evaluation
# gpt evaluation
for category in self.params:
category_metrics = self.params[category]["GPT-3.5"]
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["GPT"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
prompt, category_metrics, category)
self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
category_metrics, category, self.gpt_model)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
......@@ -106,10 +116,10 @@ class Evaluator(object):
# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items():
for category, evaluations in self.gpt_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
......@@ -121,10 +131,10 @@ class Evaluator(object):
# Start to calculate scores and save statistics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
......@@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Get battle evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
......@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
......@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Use chat models (gpt-3.5-turbo or gpt-4) to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
model: the model used to evaluate answers.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[
{
"role":
"user",
"content":
prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
},
],
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["message"]["content"],
"logprobs": None,
}
break
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use completion models (text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
......@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Use GPT models to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
prompt: prompt for GPT evaluation.
metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation.
model: the specific GPT model used to evaluate answers.
Returns:
Evaluations of the given answers.
......@@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
# Completion models can return log probabilities.
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future)
for future in tqdm.tqdm(
......@@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculate the score according to log probabilities returned by text-davinci-003.
Calculation formula:
score = sum(score_i * exp(value)), where score_i is the score corresponding to the key (predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
This paper proposes NLG evaluation methods using text-davinci-003 (log probabilities returned by completion models) and GPT-4 (probabilities obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
The score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
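To make the formula above concrete, here is a hedged sketch of turning the top log probabilities of the predicted score token into an expected score; the repository's own function body is collapsed in this diff, so this is an illustration rather than the exact implementation.

import math
from typing import Dict

def expected_score_from_logprobs(top_logprobs: Dict[str, float]) -> float:
    # top_logprobs maps candidate tokens to their log probabilities, as returned for
    # one position by openai.Completion when logprobs is requested.
    score, prob_mass = 0.0, 0.0
    for token, logprob in top_logprobs.items():
        token = token.strip()
        if token in {"1", "2", "3", "4", "5"}:    # scores are restricted to 1-5
            prob = math.exp(logprob)              # convert log probability to probability
            score += int(token) * prob            # score_i * exp(value)
            prob_mass += prob
    # Renormalizing over the score tokens that actually appear is an assumption here;
    # without it the result is exactly sum(score_i * exp(value)) from the docstring.
    return score / prob_mass if prob_mass > 0 else 0.0

For example, top_logprobs = {"4": math.log(0.7), "5": math.log(0.3)} yields 4 * 0.7 + 5 * 0.3 = 4.3.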
......@@ -369,7 +437,31 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Unlike text-davinci-003, this function calculates the score directly from the plain text response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
Args:
response: the plain text response returned by gpt-3.5-turbo or gpt-4.
evaluation: the evaluation corresponding to the question.
Returns:
The score of one answer.
"""
try:
results = re.findall(r"\d", response)
if len(results) == 1:
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return 0
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
......@@ -396,7 +488,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
if evaluation["evaluation"][metric] == {}:
# This means the server still returned an error after 3 retries, so we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {}
for metric in metrics:
......@@ -414,7 +514,7 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
)
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
......@@ -474,7 +574,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
......
[
{
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
......@@ -18,7 +18,7 @@
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"chat": {
"id": 2,
"category": "chat",
"metrics": {
......@@ -37,7 +37,7 @@
},
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"classification": {
"id": 3,
"category": "classification",
"metrics": {
......@@ -52,7 +52,7 @@
},
"prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"closed_qa": {
"id": 4,
"category": "closed_qa",
"metrics": {
......@@ -67,7 +67,7 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"extraction": {
"id": 5,
"category": "extraction",
"metrics": {
......@@ -82,7 +82,7 @@
},
"prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"generation": {
"id": 6,
"category": "generation",
"metrics": {
......@@ -97,7 +97,7 @@
},
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"open_qa": {
"id": 7,
"category": "open_qa",
"metrics": {
......@@ -112,7 +112,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"rewriting": {
"id": 8,
"category": "rewriting",
"metrics": {
......@@ -127,7 +127,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"roleplay": {
"id": 9,
"category": "roleplay",
"metrics": {
......@@ -144,7 +144,7 @@
},
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"summarization": {
"id": 10,
"category": "summarization",
"metrics": {
......@@ -161,7 +161,7 @@
},
"prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"general": {
"id": 11,
"category": "general",
"metrics": {
......@@ -176,4 +176,4 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}
]
}
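For orientation, a hedged sketch of the new prompt-file shape introduced by this change: the top-level JSON array becomes an object keyed by category, which is what self.gpt_evaluation_prompt.get(category, None) in evaluator.py looks up. The field names follow the diff and the code (prompt["metrics"], prompt["CoT"], prompt["prompt"]); the values below are illustrative English stand-ins for the Chinese originals.

gpt_evaluation_prompt = {
    "brainstorming": {
        "id": 1,
        "category": "brainstorming",
        "metrics": {"language organization": "<description of the metric>"},
        "CoT": {"language organization": "<scoring steps for the metric>"},
        "prompt": ("You are a helpful assistant. Please score the answer to the following "
                   "brainstorming question.\n\nQuestion:\n\n{question}\n\nAnswer:\n\n{answer}\n\n"
                   "Scoring metric:\n\n{metric}\n\nScoring steps:\n\n{steps}"),
    },
    # ...one entry per category, ending with a "general" entry used as a fallback when a
    # category has no dedicated prompt.
}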
......@@ -57,6 +57,7 @@ def get_data_per_category(data, categories):
data_per_category = {category: [] for category in categories}
for item in data:
category = item["category"]
data_per_category[category].append(item)
if category in categories:
data_per_category[category].append(item)
return data_per_category
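A quick, hypothetical illustration of this fix (the data below is made up; the category names come from the config shown earlier): answer items whose category is not listed in the config are now skipped instead of raising a KeyError.

data = [
    {"category": "brainstorming", "instruction": "...", "output": "..."},
    {"category": "coding", "instruction": "...", "output": "..."},   # not in the config
]
categories = ["brainstorming", "chat"]

result = get_data_per_category(data, categories)
# before this change: KeyError: 'coding'
# after this change:  {"brainstorming": [<the first item>], "chat": []}
# the empty "chat" list is what triggers the new missing-answers warning in evaluator.py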