Unverified Commit c622bb36 authored by Frank Lee, committed by GitHub

Merge pull request #3915 from FrankLeeeee/update/develop

[sync] update develop with main
parents 34966378 9c88b6cb
@@ -130,3 +130,7 @@ class Strategy(ABC):
only_rank0: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
pass
@abstractmethod
def get_model_state_dict_shard(self, model: nn.Module, **config):
pass
\ No newline at end of file
@@ -186,3 +186,15 @@ class ColossalAIStrategy(DDPStrategy):
if self.stage == 3:
raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
super().save_pretrained(model, path, only_rank0, tokenizer)
def get_model_state_dict_shard(self, model: nn.Module, **config):
if self.stage != 3:
yield from super().get_model_state_dict_shard(model, **config)
else:
# unwrapped_model = self._unwrap_model(model)
# for module in unwrapped_model.modules():
# if isinstance(module, LoraLinear):
# module.merge_weights = True
# module.eval()
base_model: ZeroDDP = get_base_model(model)
yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
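For reviewers, a minimal consumption sketch (hypothetical names, not part of this PR): the generator lets callers stream the stage-3 state dict shard by shard instead of materializing it on one rank. Since only_rank_0=False, every rank yields the shards, so a real caller iterates on all ranks and guards writes by rank.

import torch
import torch.distributed as dist

# Hypothetical sketch: `strategy` and `actor_model` are assumed to exist,
# and the file naming is illustrative only.
for i, shard in enumerate(strategy.get_model_state_dict_shard(actor_model)):
    if dist.get_rank() == 0:    # all ranks iterate; only one writes to disk
        torch.save(shard, f"actor_shard_{i:05d}.pt")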
@@ -26,19 +26,8 @@ class DDPStrategy(NaiveStrategy):
super().__init__()
def setup_distributed(self) -> None:
try:
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
host = os.environ['MASTER_ADDR']
port = int(os.environ['MASTER_PORT'])
except KeyError as e:
raise RuntimeError(
f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
)
dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
self._try_init_dist(force=True)
self.set_seed(self.seed)
torch.cuda.set_device(local_rank)
def set_seed(self, seed: int) -> None:
random.seed(seed)
......
from typing import Any, Optional
import os
import sys
from collections import OrderedDict
from typing import Any, Dict, Optional
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from coati.models.base import get_base_model
from coati.replay_buffer import ReplayBuffer
from coati.models.base import RewardModel
from coati.models.lora import LoraLinear
from coati.replay_buffer import ReplayBuffer
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from transformers.modeling_utils import PreTrainedModel
@@ -13,6 +20,15 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from .base import Strategy
# TODO: move this to a util.py (moving it to ray.util introduces a circular import)
def get_grad_required_state_dict(model: nn.Module):
state_dict = OrderedDict()
for name, parameter in model.named_parameters():
if parameter.requires_grad:
state_dict[name] = parameter.detach()
return state_dict
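A self-contained sketch of what this helper yields (the toy model below is illustrative, not part of the PR): freezing a submodule drops its tensors from the filtered dict, which is the point when only LoRA adapters are trainable.

import torch.nn as nn

# Toy example: freeze layer 0 and confirm only layer 1's tensors survive.
model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
model[0].requires_grad_(False)              # stand-in for a frozen base model
sd = get_grad_required_state_dict(model)
print(list(sd.keys()))                      # ['1.weight', '1.bias']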
class NaiveStrategy(Strategy):
"""
Strategy for single GPU. No parallelism is used.
@@ -25,7 +41,7 @@ class NaiveStrategy(Strategy):
optimizer.step()
def setup_distributed(self) -> None:
pass
self._try_init_dist(force=False)
def setup_model(self, model: nn.Module) -> nn.Module:
return model
@@ -68,3 +84,45 @@ class NaiveStrategy(Strategy):
unwrapped_model.save_pretrained(path)
if tokenizer is not None:
tokenizer.save_pretrained(path)
def get_model_state_dict_shard(self, model: nn.Module, **config):
# TODO: implement sharding on naive strategy
model = self.unwrap_model(model)
if config.get('requires_grad_only', False):
state_dict = get_grad_required_state_dict(model)
else:
state_dict = model.state_dict()
if 'shard_size' in config:
shard_size = config['shard_size']
accumulate_size = 0
state_dict_shard = OrderedDict()
for name, param in state_dict.items():
state_dict_shard[name] = param
accumulate_size += param.numel() * param.element_size()
if accumulate_size >= shard_size:
accumulate_size = 0
yield state_dict_shard
state_dict_shard = OrderedDict()
if accumulate_size > 0:
yield state_dict_shard
else:
yield state_dict
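A hedged usage sketch of the config keys handled above (shard_size is in bytes; the consumer function is a placeholder, not a real API):

# Hypothetical caller: stream ~100 MB shards of only the trainable parameters.
for shard in strategy.get_model_state_dict_shard(model,
                                                 requires_grad_only=True,
                                                 shard_size=100 * 1024 ** 2):
    broadcast_to_inference_workers(shard)   # placeholder for the real consumer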
def _try_init_dist(self, force: bool = False) -> None:
try:
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
host = os.environ['MASTER_ADDR']
port = int(os.environ['MASTER_PORT'])
dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
except KeyError as e:
if force:
raise RuntimeError(
f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
)
except Exception:
if force:
raise
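The net effect of _try_init_dist is best-effort initialization: under torchrun the five env vars read above are set and NCCL init proceeds; in a bare single-process run NaiveStrategy (force=False) silently skips it while DDPStrategy (force=True) raises. A small sketch, assuming NaiveStrategy takes no required constructor arguments:

# torchrun --nproc_per_node=2 train.py  -> RANK, LOCAL_RANK, WORLD_SIZE,
#                                          MASTER_ADDR, MASTER_PORT are exported
# python train.py                       -> the vars are absent
strategy = NaiveStrategy()
strategy.setup_distributed()    # force=False: a no-op when the vars are missing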
@@ -27,6 +27,7 @@ class DistributedSampler:
assert len(indices) == self.num_samples
self.indices = indices
def sample(self, batch_size: int) -> list:
sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
return [self.dataset[idx] for idx in sampled_indices]
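One caveat worth noting in review: np.random.choice(..., replace=False) raises once batch_size exceeds the per-rank index count. A standalone illustration:

import numpy as np

indices = np.arange(4)
np.random.choice(indices, 3, replace=False)     # fine: 3 distinct picks from 4
# np.random.choice(indices, 5, replace=False) raises ValueError:
# "Cannot take a larger sample than population when 'replace=False'"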
@@ -2,7 +2,7 @@
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"creativity",
@@ -14,7 +14,7 @@
]
},
"chat": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"naturalness",
@@ -26,7 +26,7 @@
]
},
"classification": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -38,7 +38,7 @@
]
},
"closed_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -50,7 +50,7 @@
]
},
"extraction": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -62,7 +62,7 @@
]
},
"generation": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"diversity"
@@ -74,7 +74,7 @@
]
},
"open_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -84,7 +84,7 @@
]
},
"rewriting": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
@@ -96,7 +96,7 @@
]
},
"roleplay": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"fidelity",
@@ -107,7 +107,7 @@
]
},
"summarization": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness",
...
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
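A minimal sketch of how this config file is consumed (it mirrors main() in the next diff; the filename is illustrative):

from utils import jload    # project helper, imported the same way in evaluate.py

config = jload("config_en.json")
assert config["language"] in ["cn", "en"]
metrics_per_category = {cat: config["category"][cat] for cat in config["category"]}
# e.g. metrics_per_category["summarization"]["GPT"] -> ["language organization", ...]
# and metrics_per_category["summarization"]["Metrics"] -> ["BLEU", "ROUGE", "BERTScore"]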
@@ -14,7 +14,7 @@ def main(args):
# load config
config = jload(args.config_file)
if config["language"] == "cn":
if config["language"] in ["cn", "en"]:
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
@@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"])
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
@@ -87,6 +88,10 @@ if __name__ == '__main__':
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--gpt_model',
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='your OpenAI API key')
args = parser.parse_args()
...
@@ -4,7 +4,7 @@ from typing import Any, Dict, List
import gpt_evaluate
import metrics
import pandas as pd
from utils import get_data_per_category, jdump
from utils import analyze_automatic_results, get_data_per_category, save_automatic_results
class Evaluator(object):
@@ -14,13 +14,15 @@ class Evaluator(object):
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
Any]) -> None:
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
gpt_model: str, language: str) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.gpt_model = gpt_model
self.language = language
self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict()
self.gpt_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
@@ -40,21 +42,21 @@ class Evaluator(object):
"""
def switch(metric):
def switch(metric, language):
if metric == "BLEU":
return metrics.bleu_score(preds=predicts_list, targets=targets_list)
return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
elif metric == "ROUGE":
return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Distinct"):
return metrics.distinct_score(preds=predicts_list)
return metrics.distinct_score(preds=predicts_list, language=language)
elif (metric == "BERTScore"):
return metrics.bert_score(preds=predicts_list, targets=targets_list)
return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Precision"):
return metrics.precision(preds=predicts_list, targets=targets_list)
return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Recall"):
return metrics.recall(preds=predicts_list, targets=targets_list)
return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "F1 score"):
return metrics.F1_score(preds=predicts_list, targets=targets_list)
return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
else:
raise ValueError(f"Unexpected metric {metric}.")
@@ -63,6 +65,10 @@ class Evaluator(object):
# automatic evaluation
for category in self.params:
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
@@ -72,19 +78,23 @@ class Evaluator(object):
predicts_list = [answer["output"] for answer in answers_per_category[category]]
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))
self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))
# gpt35 evaluation
# gpt evaluation
for category in self.params:
category_metrics = self.params[category]["GPT-3.5"]
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["GPT"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
prompt, category_metrics, category)
self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
category_metrics, category, self.gpt_model)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
@@ -96,35 +106,29 @@ class Evaluator(object):
save_path = os.path.join(path, "gpt_evaluate", "battle_results")
gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
else:
# save evaluation results for automatic metrics
automatic_df = pd.DataFrame(self.automatic_metric_stats)
automatic_results_save_path = os.path.join(path, "automatic_results")
if not os.path.exists(automatic_results_save_path):
os.makedirs(automatic_results_save_path)
automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations,
os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
# Start to calculate scores and save statictics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
# Save evaluation results for automatic metrics
automatic_base_save_path = os.path.join(path, "automatic_results")
automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
# Save charts and csv.
automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)
# Save evaluation results for GPT evaluation metrics.
gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results,
gpt_evaluation_results_save_path)
# Start to calculate scores and save statistics.
gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
gpt_evaluation_statistics_save_path)
# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
gpt_evaluation_analyses_save_path)
@@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Get battle evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Use a chat model (gpt-3.5-turbo or gpt-4) to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
model: the model used to evaluate answers.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[
{
"role":
"user",
"content":
prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
},
],
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["message"]["content"],
"logprobs": None,
}
break
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
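A hypothetical single-answer call for reference; it assumes openai.api_key is already set and that `prompt` follows the prompt-file schema shown later in this diff (keys "prompt", "metrics", "CoT", each covering "correctness"):

inst = {
    "id": 1,
    "category": "open_qa",
    "instruction": "What is the capital of France?",
    "input": "",
    "output": "Paris.",
}
result = get_gpt_evaluation_without_logprobs(prompt, inst, ["correctness"], model="gpt-4")
print(result["evaluation"]["correctness"]["response"])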
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use a completion model (text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Use GPT models to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
prompt: prompt for GPT evaluation.
metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation.
model: the specific GPT model used to evaluate answers.
Returns:
Evaluations of the given answers.
@@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
# Completion models can return log probabilities.
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future)
for future in tqdm.tqdm(
@@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculate the score according to log probabilities returned by text-davinci-003.
Calculation formula:
score = sum(score_i * exp(logprob_i)), where score_i is the score corresponding to the i-th predicted token and logprob_i is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
This paper proposes NLG evaluation methods using text-davinci-003 (log probabilities returned by completion models) and GPT-4 (probabilities obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
The score of one answer.
"""
# GPT-3.5 only returns scores from 1 to 5.
@@ -369,14 +437,59 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score
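A worked instance of the formula above (the logprob values are invented; note the returned top tokens' probabilities need not sum to 1, so the result is an unnormalized expectation over the candidate scores 1 to 5):

import math

logprobs = {"1": -4.2, "2": -3.1, "3": -1.5, "4": -0.7, "5": -1.9}
score = sum(int(token) * math.exp(lp) for token, lp in logprobs.items())
print(round(score, 2))    # ~3.51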
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Unlike text-davinci-003, this function extracts the score directly from the plain-text response.
Although text-davinci-003 can return log probabilities, it costs roughly ten times as much as gpt-3.5-turbo.
Args:
response: the plain response returned by gpt-3.5-turbo or gpt-4.
evaluation: the evaluation record corresponding to the question.
Returns:
The score of one answer.
"""
try:
results = re.findall(r"\d", response)
if len(results) == 1:
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return 0
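Illustrative behavior of the digit extraction above (inputs are made up): exactly one digit parses to a score; anything else falls through to 0.

import re

assert re.findall(r"\d", "Score: 4") == ["4"]           # unambiguous -> score 4
assert re.findall(r"\d", "4 out of 5") == ["4", "5"]    # ambiguous -> scored 0
assert re.findall(r"\d", "No score given.") == []       # missing -> scored 0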
def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
save_path: str) -> Dict[str, Any]:
"""
Save evaluation results for different categories for one model.
Args:
model_name: name of the model for saving evaluation results.
gpt_evaluation_results: evaluation results for all of the model answers.
save_path: path to save GPT evaluation results.

Returns:
All the evaluations, concatenated across categories.
"""
all_evaluations = []
for category, evaluations in gpt_evaluation_results.items():
jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json"))
return all_evaluations
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
Args:
model_name: name of the model for saving statistics.
evaluations: evaluations for all of the model answers.
save_path: path to save GPT-3.5 evaluation statistics.
save_path: path to save GPT evaluation statistics.
"""
if not os.path.exists(save_path):
@@ -396,7 +509,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
if evaluation["evaluation"][metric] == {}:
# This means that after 3 retries the API still returned an error, so we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {}
for metric in metrics:
@@ -414,9 +535,9 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
)
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
Analyze and visualize all GPT evaluation statistics in the given directory.
Args:
statistics_path: path to all the models' statistics.
@@ -474,7 +595,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
@@ -494,3 +615,5 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
plt.close()
{
"id": 1,
"system_prompt": "You are a helpful and precise assistant for checking the quality of the answer. You will be given two different answers to the same question",
"prompt_template": "[Question]\n{question}\n\n[The Start of AI Assistant 1's Answer]\n{answer_1}\n\n[The End of AI Assistant 1's Answer]\n\n[The Start of AI Assistant 2's Answer]\n{answer_2}\n\n[The End of AI Assistant 2's Answer]\n\n[Requirements]\n{prompt}\n\n",
"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."
}
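For context, a hedged sketch of how this battle prompt might be assembled (the assembly code is not part of this diff; the field names come from the JSON above, the answer variables are hypothetical):

user_prompt = battle_prompt["prompt_template"].format(
    question=question,
    answer_1=answer1["output"],
    answer_2=answer2["output"],
    prompt=battle_prompt["prompt"],
)
# battle_prompt["system_prompt"] would then be passed alongside it, e.g. to
# get_battle_result(sys_prompt, user_prompt, id) from gpt_evaluate.py.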
[
{
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
@@ -18,7 +18,7 @@
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"chat": {
"id": 2,
"category": "chat",
"metrics": {
@@ -37,7 +37,7 @@
},
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"classification": {
"id": 3,
"category": "classification",
"metrics": {
@@ -52,7 +52,7 @@
},
"prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"closed_qa": {
"id": 4,
"category": "closed_qa",
"metrics": {
@@ -67,7 +67,7 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"extraction": {
"id": 5,
"category": "extraction",
"metrics": {
@@ -82,7 +82,7 @@
},
"prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"generation": {
"id": 6,
"category": "generation",
"metrics": {
@@ -97,7 +97,7 @@
},
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"open_qa": {
"id": 7,
"category": "open_qa",
"metrics": {
@@ -112,7 +112,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"rewriting": {
"id": 8,
"category": "rewriting",
"metrics": {
@@ -127,7 +127,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"roleplay": {
"id": 9,
"category": "roleplay",
"metrics": {
@@ -144,7 +144,7 @@
},
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"summarization": {
"id": 10,
"category": "summarization",
"metrics": {
@@ -161,7 +161,7 @@
},
"prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"general": {
"id": 11,
"category": "general",
"metrics": {
@@ -176,4 +176,4 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}
]
}
@@ -8,3 +8,5 @@ seaborn
pandas
matplotlib
numpy
zhon
rouge_score
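Presumably the two new dependencies back the language-aware metrics introduced in this PR (an assumption; metrics.py is not shown in this excerpt). A quick sanity check of both imports:

from rouge_score import rouge_scorer    # Google's ROUGE implementation, for en
from zhon import hanzi                  # Chinese punctuation/character sets, for cn

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
print(scorer.score("the cat sat", "the cat sat on the mat"))
print(hanzi.punctuation[:10])           # sample of Chinese punctuation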