Unverified commit 57a6d768 authored by Yuanchen, committed by GitHub

support evaluation for english (#3880)


Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
parent 18787497
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
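To illustrate how this configuration drives the pipeline, here is a minimal sketch of loading it and listing the metric settings per category; the file name config_en.json and the printout are illustrative, not taken from the commit. The diff below then widens the language check in the evaluation entry point so that this English config is accepted.

import json

# Illustrative file name; use the path where this config is stored in the repo.
with open("config_en.json", "r", encoding="utf-8") as f:
    config = json.load(f)

print(config["language"])                          # "en"
for category, settings in config["category"].items():
    gpt_criteria = settings["GPT"]                 # criteria scored by the GPT evaluator
    automatic_metrics = settings["Metrics"]        # automatic metrics such as BLEU or Distinct
    print(f"{category}: GPT={gpt_criteria}, Metrics={automatic_metrics}")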
@@ -14,7 +14,7 @@ def main(args):
     # load config
     config = jload(args.config_file)
-    if config["language"] == "cn":
+    if config["language"] in ["cn", "en"]:
         # get metric settings for all categories
         metrics_per_category = {}
         for category in config["category"].keys():
...
@@ -4,7 +4,7 @@ from typing import Any, Dict, List
 import gpt_evaluate
 import metrics
 import pandas as pd
-from utils import get_data_per_category, jdump
+from utils import analyze_automatic_results, get_data_per_category, save_automatic_results


 class Evaluator(object):
@@ -42,21 +42,21 @@ class Evaluator(object):
         """

-        def switch(metric):
+        def switch(metric, language):
             if metric == "BLEU":
-                return metrics.bleu_score(preds=predicts_list, targets=targets_list)
+                return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
             elif metric == "ROUGE":
-                return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
+                return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "Distinct"):
-                return metrics.distinct_score(preds=predicts_list)
+                return metrics.distinct_score(preds=predicts_list, language=language)
             elif (metric == "BERTScore"):
-                return metrics.bert_score(preds=predicts_list, targets=targets_list)
+                return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "Precision"):
-                return metrics.precision(preds=predicts_list, targets=targets_list)
+                return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "Recall"):
-                return metrics.recall(preds=predicts_list, targets=targets_list)
+                return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
             elif (metric == "F1 score"):
-                return metrics.F1_score(preds=predicts_list, targets=targets_list)
+                return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
             else:
                 raise ValueError(f"Unexpected metric")
@@ -78,7 +78,7 @@ class Evaluator(object):
                 predicts_list = [answer["output"] for answer in answers_per_category[category]]

                 for metric in category_metrics:
-                    self.automatic_metric_stats[category].update(switch(metric=metric))
+                    self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))

         # gpt evaluation
         for category in self.params:
@@ -106,35 +106,29 @@ class Evaluator(object):
             save_path = os.path.join(path, "gpt_evaluate", "battle_results")
             gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
         else:
-            # save evaluation results for automatic metrics
-            automatic_df = pd.DataFrame(self.automatic_metric_stats)
-
-            automatic_results_save_path = os.path.join(path, "automatic_results")
-            if not os.path.exists(automatic_results_save_path):
-                os.makedirs(automatic_results_save_path)
-            automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
+            # Save evaluation results for automatic metrics
+            automatic_base_save_path = os.path.join(path, "automatic_results")
+            automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
+
+            save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
+
+            # Save charts and csv.
+            automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
+            analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)

-            # Save evaluation results for GPT-3.5 evaluation metrics.
-            all_evaluations = []
-            base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
-            evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
-            for category, evaluations in self.gpt_evaluation_results.items():
-                jdump(
-                    evaluations,
-                    os.path.join(evaluation_results_save_path, model_name_list[0],
-                                 f"{category}_evaluation_results.json"))
-                all_evaluations.extend(evaluations)
-            jdump(all_evaluations,
-                  os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
+            # Save evaluation results for GPT evaluation metrics.
+            gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
+            gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
+
+            all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results,
+                                                                       gpt_evaluation_results_save_path)

             # Start to calculate scores and save statistics.
-            evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
+            gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
             gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
-                                                        evaluation_statistics_save_path)
+                                                        gpt_evaluation_statistics_save_path)

             # Save charts and csv.
-            evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
-            gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path,
-                                                           evaluation_analyses_save_path)
+            gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
+            gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
+                                                           gpt_evaluation_analyses_save_path)
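The key change above is that switch() now forwards the configured language to every automatic metric. A hedged usage sketch of the new metric signatures on toy English data (the strings are illustrative, not from the commit):

import metrics

preds = ["the cat sat on the mat"]
targets = ["a cat was sitting on the mat"]

bleu = metrics.bleu_score(preds=preds, targets=targets, language="en")
rouge = metrics.rouge_score(preds=preds, targets=targets, language="en")
distinct = metrics.distinct_score(preds=preds, language="en")
# Each call returns a dict of scores, which switch() passes on to
# automatic_metric_stats[category].update() in the evaluator.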
@@ -461,6 +461,27 @@ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) ->
     return 0


+def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
+                                save_path: str) -> Dict[str, Any]:
+    """
+    Save evaluation results for different categories for one model.
+
+    Args:
+        model_name: name of the model for saving evaluation results.
+        gpt_evaluation_results: evaluations results for all of the model answers.
+        save_path: path to save GPT evaluation statistics.
+    """
+    all_evaluations = []
+    for category, evaluations in gpt_evaluation_results.items():
+        jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json"))
+        all_evaluations.extend(evaluations)
+
+    jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json"))
+
+    return all_evaluations
+
+
 def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
     """
     Generate statistics for one model.
@@ -468,7 +489,7 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
     Args:
         model_name: name of the model for saving statistics.
         evaluations: evaluations for all of the model answers.
-        save_path: path to save GPT-3.5 evaluation statistics.
+        save_path: path to save GPT evaluation statistics.
     """
     if not os.path.exists(save_path):
@@ -516,7 +537,7 @@ def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:

 def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
     """
-    Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
+    Analyze and visualize all GPT evaluation statistics in the given directory.

     Args:
         statistics_path: path to all the models' statistics.
@@ -594,3 +615,5 @@ def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
         figure = fig.get_figure()
         figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
+
+        plt.close()
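For context, a small usage sketch of the new save_gpt_evaluation_results helper; the model name, category entries and save path are illustrative toy data, not from the commit:

from gpt_evaluate import save_gpt_evaluation_results

# Maps category name -> list of evaluation records produced by the GPT evaluator.
gpt_evaluation_results = {
    "brainstorming": [{"id": 1, "category": "brainstorming", "evaluation": "..."}],
    "chat": [{"id": 2, "category": "chat", "evaluation": "..."}],
}

all_evaluations = save_gpt_evaluation_results(
    "my_model",
    gpt_evaluation_results,
    "outputs/gpt_evaluate/gpt_evaluate_results/evaluation_results",
)
# Writes <save_path>/my_model/<category>_evaluation_results.json per category and
# <save_path>/my_model_evaluation_results.json with all categories combined, then
# returns the combined list for save_gpt_evaluation_statistics.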
 import statistics
+from typing import Dict, List

 import jieba
 from bert_score import score
 from nltk.translate.bleu_score import sentence_bleu
 from rouge_chinese import Rouge as Rouge_cn
+from rouge_score import rouge_scorer as Rouge_en
 from sklearn.metrics import f1_score, precision_score, recall_score
+from utils import preprocessing_text, remove_redundant_space


-def bleu_score(preds: list, targets: list) -> dict:
+def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
     """Calculate BLEU Score Metric

     The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
@@ -21,8 +24,12 @@ def bleu_score(preds: list, targets: list) -> dict:
                (1. / 4., 1. / 4., 1. / 4., 1. / 4.)]

     for pred, target in zip(preds, targets):
-        pred_list = (' '.join(jieba.cut(pred))).split()
-        target_list = [(' '.join(jieba.cut(target))).split()]
+        if language == "cn":
+            pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
+            target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()]
+        elif language == "en":
+            pred_list = preprocessing_text(pred).split()
+            target_list = [preprocessing_text(target).split()]

         bleu = sentence_bleu(target_list, pred_list, weights=weights)
         cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
@@ -33,7 +40,7 @@ def bleu_score(preds: list, targets: list) -> dict:
     return bleu_scores


-def rouge_cn_score(preds: list, targets: list) -> dict:
+def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
     """Calculate Chinese ROUGE Score Metric

     The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
@@ -41,13 +48,13 @@ def rouge_cn_score(preds: list, targets: list) -> dict:
     the preds and targets. ROUGE-L measures the number of matching
     longest common subsequence (LCS) between preds and targets.
     """
-    rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}}
+    rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
     all_preds = []
     all_targets = []

     for pred, target in zip(preds, targets):
-        pred_list = ' '.join(jieba.cut(pred))
-        target_list = ' '.join(jieba.cut(target))
+        pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred))))
+        target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target))))
         all_preds.append(pred_list)
         all_targets.append(target_list)

@@ -61,7 +68,42 @@ def rouge_cn_score(preds: list, targets: list) -> dict:
     return rouge_scores


-def distinct_score(preds: list) -> dict:
+def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
+    """Calculate English ROUGE Score Metric
+
+    The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
+    and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
+    the preds and targets. ROUGE-L measures the number of matching
+    longest common subsequence (LCS) between preds and targets.
+    """
+    rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
+    all_preds = []
+    all_targets = []
+
+    rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
+
+    for pred, target in zip(preds, targets):
+        score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
+        rouge_scores["rouge1"] += score['rouge1'].fmeasure
+        rouge_scores["rouge2"] += score['rouge2'].fmeasure
+        rouge_scores["rougeL"] += score['rougeL'].fmeasure
+
+    rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
+    rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
+    rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds)
+
+    return rouge_scores
+
+
+def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
+    """Calculate ROUGE Score Metric"""
+    if language == "cn":
+        return rouge_cn_score(preds, targets)
+    elif language == "en":
+        return rouge_en_score(preds, targets)
+
+
+def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
     """Calculate Distinct Score Metric

     This metric refers to https://arxiv.org/abs/1510.03055.
@@ -72,19 +114,36 @@ def distinct_score(preds: list) -> dict:
     cumulative_distinct = []

     for pred in preds:
-        pred_seg_list = list(' '.join(jieba.cut(pred)))
-        count_segs = len(pred_seg_list)
-        unique_segs = set(pred_seg_list)
-        count_unique_chars = len(unique_segs)
-
-        cumulative_distinct.append(count_unique_chars / count_segs)
+        if language == "cn":
+            pred_seg_list = ' '.join(jieba.cut(pred)).split()
+            count_segs = len(pred_seg_list)
+            unique_segs = set(pred_seg_list)
+            count_unique_chars = len(unique_segs)
+
+            cumulative_distinct.append(count_unique_chars / count_segs)
+        elif language == "en":
+            # calculate distinct 1-gram, 2-gram, 3-gram
+            unique_ngram = [set() for _ in range(0, 3)]
+            all_ngram_count = [0 for _ in range(0, 3)]
+
+            split_pred = preprocessing_text(pred).split()
+            for n in range(0, 3):
+                for i in range(0, len(split_pred) - n):
+                    ngram = ' '.join(split_pred[i:i + n + 1])
+                    unique_ngram[n].add(ngram)
+                    all_ngram_count[n] += 1
+
+            # Sometimes the answer may contain only one word. For 2-gram and 3-gram, the gram count (denominator) may be zero.
+            avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)]
+
+            cumulative_distinct.append(statistics.mean(avg_distinct))

     distinct_score["distinct"] = statistics.mean(cumulative_distinct)

     return distinct_score


-def bert_score(preds: list, targets: list) -> dict:
+def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
     """Calculate BERTScore Metric

     The BERTScore evaluates the semantic similarity between
@@ -95,23 +154,25 @@ def bert_score(preds: list, targets: list) -> dict:
     target_list = []

     for pred, target in zip(preds, targets):
-        pred_list.append(' '.join(jieba.cut(pred)))
-        target_list.append(' '.join(jieba.cut(target)))
+        pred_list.append(pred)
+        target_list.append(target)

-    _, _, F = score(pred_list, target_list, lang="zh", verbose=True)
+    if language == "cn":
+        _, _, F = score(pred_list, target_list, lang="zh", verbose=True)
+    elif language == "en":
+        _, _, F = score(pred_list, target_list, lang="en", verbose=True)

     bert_score["bert_score"] = F.mean().item()

     return bert_score


-def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
+def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
     """Precision, Recall and F1-Score Calculation

     The calculation of precision, recall and f1-score is realized by counting
     the number f overlaps between the preds and target. The comparison length
-    limited by the shorter one of preds and targets. This design is mainly
-    considered for classification and extraction categories.
+    limited by the shorter one of preds and targets.
     """
     precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
     precision_scores = []
@@ -119,8 +180,12 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
     f1_scores = []

     for pred, target in zip(preds, targets):
-        pred_list = [char for char in pred]
-        target_list = [char for char in target]
+        if language == "cn":
+            pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()]
+            target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()]
+        elif language == "en":
+            pred_list = [char for char in preprocessing_text(pred).split()]
+            target_list = [char for char in preprocessing_text(target).split()]

         target_labels = [1] * min(len(target_list), len(pred_list))
         pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
@@ -136,34 +201,31 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
     return precision_recall_f1


-def precision(preds: list, targets: list) -> dict:
+def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
     """Calculate Precision Metric
-    (design for classification and extraction categories)

     Calculating precision by counting the number of overlaps between the preds and target.
     """
     precision = {"precision": 0}
-    precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"]
+    precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"]
     return precision


-def recall(preds: list, targets: list) -> dict:
+def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
     """Calculate Recall Metric
-    (design for classification and extraction categories)

     Calculating recall by counting the number of overlaps between the preds and target.
     """
     recall = {"recall": 0}
-    recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"]
+    recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"]
     return recall


-def F1_score(preds: list, targets: list) -> dict:
+def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
     """Calculate F1-score Metric
-    (design for classification and extraction categories)

     Calculating f1-score by counting the number of overlaps between the preds and target.
     """
     f1 = {"f1_score": 0}
-    f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"]
+    f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"]
     return f1
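To make the English Distinct calculation above concrete, a small worked example with a toy prediction; the numbers follow directly from the n-gram counts in the new "en" branch:

from metrics import distinct_score

# "red fish blue fish" contains no articles or punctuation, so preprocessing leaves it unchanged.
result = distinct_score(preds=["red fish blue fish"], language="en")
# 1-grams: 3 unique / 4 total = 0.75
# 2-grams: 3 unique / 3 total = 1.0
# 3-grams: 2 unique / 2 total = 1.0
# distinct = mean(0.75, 1.0, 1.0) ≈ 0.92 (marginally lower due to the 1e-6 smoothing term)
print(result)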
{
"id": 1,
"system_prompt": "You are a helpful and precise assistant for checking the quality of the answer. You will be given two different answers to the same question",
"prompt_template": "[Question]\n{question}\n\n[The Start of AI Assistant 1's Answer]\n{answer_1}\n\n[The End of AI Assistant 1's Answer]\n\n[The Start of AI Assistant 2's Answer]\n{answer_2}\n\n[The End of AI Assistant 2's Answer]\n\n[Requirements]\n{prompt}\n\n",
"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."
}
@@ -8,3 +8,5 @@ seaborn
 pandas
 matplotlib
 numpy
+zhon
+rouge_score
 import io
 import json
 import os
+import re
+import string
+from typing import Dict
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+import tqdm
+from zhon import hanzi


 def _make_w_io_base(f, mode: str):
@@ -29,7 +38,7 @@ def jdump(obj, f, mode="w", indent=4, default=str):
     """
     f = _make_w_io_base(f, mode)
     if isinstance(obj, (dict, list)):
-        json.dump(obj, f, indent=indent, default=default)
+        json.dump(obj, f, indent=indent, default=default, ensure_ascii=False)
     elif isinstance(obj, str):
         f.write(obj)
     else:
@@ -61,3 +70,149 @@ def get_data_per_category(data, categories):
             data_per_category[category].append(item)

     return data_per_category
def remove_articles(text: str) -> str:
"""
Remove articles "a, an, the" in the given text.
It is used in evaluation of automatic metrics.
"""
pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(pattern, " ", text)
def remove_punctuations(text: str) -> str:
"""
Remove punctuations in the given text.
It is used in evaluation of automatic metrics.
"""
punctuation = string.punctuation + hanzi.punctuation
punctuation = set([char for char in punctuation])
punctuation.difference_update(set("!@#$%&()<>?|,.\"'"))
out = []
for char in text:
if char in punctuation:
continue
else:
out.append(char)
return "".join(out)
def remove_redundant_space(text: str) -> str:
"""
Remove redundant spaces in the given text.
It is used in evaluation of automatic metrics.
"""
return " ".join(text.split())
def preprocessing_text(text: str) -> str:
"""
Preprocess the given text.
It is used in evaluation of automatic metrics.
"""
return remove_redundant_space(remove_articles(remove_punctuations(text.lower())))
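# Illustrative example (not part of the diff): the combined pipeline on a toy input
#   preprocessing_text("A simple Example -- with   extra spaces; and THE article")
# lowercases the text, strips most punctuation, drops articles and collapses spaces, giving
#   "simple example with extra spaces and article"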
def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None:
"""
Save automatic evaluation results of different categories for one model.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
automatic_df = pd.DataFrame(automatic_metric_stats)
automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
def read_automatic_results(results_path: str, file_name: str) -> Dict[str, Dict]:
"""
Read a csv file and return a dictionary which stores scores per metric.
"""
results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
results_dict = {metric: {} for metric in list(results.index)}
for i, metric in enumerate(results_dict.keys()):
for j, category in enumerate(list(results.columns)):
if pd.isnull(results.iloc[i][j]):
continue
results_dict[metric][category] = results.iloc[i][j]
return results_dict
def analyze_automatic_results(results_path: str, save_path: str) -> None:
"""
Analyze and visualize all csv files in the given folder.
"""
if not os.path.exists(results_path):
raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
all_statistics = {}
for file_name in os.listdir(results_path):
if file_name.endswith("_results.csv"):
model_name = file_name.split("_results.csv")[0]
all_statistics[model_name] = read_automatic_results(results_path, file_name)
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no csv files in the given directory "{results_path}"!')
frame_all = {"model": [], "category": [], "metric": [], "score": []}
frame_per_metric = {}
for model_name, model_statistics in all_statistics.items():
for metric, metric_statistics in model_statistics.items():
if frame_per_metric.get(metric) is None:
frame_per_metric[metric] = {"model": [], "category": [], "score": []}
for category, category_score in metric_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["score"].append(category_score)
frame_per_metric[metric]["model"].append(model_name)
frame_per_metric[metric]["category"].append(category)
frame_per_metric[metric]["score"].append(category_score)
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"metric: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
sns.set()
fig = plt.figure(figsize=(16, 10))
fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
fig.set_title(f"Comparison between Different Models for Metric {metric.title()}")
plt.xlabel("Evaluation Category")
plt.ylabel("Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
plt.close()
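Putting the new utilities together, a hedged end-to-end sketch of the automatic-results flow; the model name, scores and output paths are illustrative toy data:

from utils import analyze_automatic_results, save_automatic_results

# Per-category automatic metric scores for one model (toy numbers).
stats = {
    "closed_qa": {"rouge1": 0.42, "bert_score": 0.88},
    "summarization": {"rouge1": 0.45, "bert_score": 0.90},
}

results_dir = "outputs/automatic_results/evaluation_results"      # illustrative paths
analyses_dir = "outputs/automatic_results/evaluation_analyses"

save_automatic_results("my_model", stats, results_dir)            # writes my_model_results.csv
analyze_automatic_results(results_dir, analyses_dir)              # writes a combined csv and one bar chart per metric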