Unverified commit ce777853, authored by Yuanchen, committed by GitHub

[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)



* Add ColossalEval

* Delete evaluate in Chat

---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
parent 74aa7d96
{
"language": "cn",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"naturalness",
"engagingness",
"fidelity"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
]
},
"closed_qa": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
]
},
"extraction": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"logical_reasoning": {
"GPT": [
"correctness",
"relevance",
"reasonableness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
]
},
"Finance": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Law": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Education": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Medical": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"STEM": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"SocialScience": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Humanity": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Other": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"ethics": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
}
}
}
{
"language": "en",
"path_for_UniEval": {
"summarization": "path to unieval-sum",
"dialogue": "path to unieval-dialog",
"data2text": "path to unieval-sum"
},
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"reasonableness"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"chat": {
"GPT": [
"language organization",
"naturalness",
"engagingness",
"fidelity"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"dialogue-naturalness",
"dialogue-coherence",
"dialogue-understandability",
"data2text-naturalness",
"data2text-informativeness"
]
},
"classification": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"closed_qa": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"extraction": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"logical_reasoning": {
"GPT": [
"correctness",
"relevance",
"reasonableness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
],
"UniEval": [
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
],
"UniEval": [
]
},
"Finance": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Law": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Education": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Medical": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"STEM": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"SocialScience": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Humanity": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Other": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"ethics": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
}
}
}
import statistics
from typing import Dict, List
import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import sentence_chrf
from rouge_chinese import Rouge as Rouge_cn
from rouge_score import rouge_scorer as Rouge_en
from sklearn.metrics import f1_score, precision_score, recall_score
from utils import preprocessing_text, remove_redundant_space
def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BLEU Score Metric
The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
BLEU-3 for trigram and BLEU-4 for 4-gram. Unigrams evaluate accuracy
at the word level, while higher-order n-grams evaluate fluency at the
sentence level.
"""
bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
cumulative_bleu = [0] * 4
weights = [
(1.0 / 1.0, 0.0, 0.0, 0.0),
(1.0 / 2.0, 1.0 / 2.0, 0.0, 0.0),
(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0),
(1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0),
]
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
target_list = [(" ".join(jieba.cut(preprocessing_text(target)))).split()]
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = [preprocessing_text(target).split()]
bleu = sentence_bleu(target_list, pred_list, weights=weights)
cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
for i in range(len(cumulative_bleu)):
bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds)
return bleu_scores
def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate CHRF Score Metric in sentence level."""
chrf_score = {"chrf": 0}
cumulative_chrf = []
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
target_list = " ".join(jieba.cut(preprocessing_text(target))).split()
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = preprocessing_text(target).split()
cumulative_chrf.append(sentence_chrf(target_list, pred_list))
chrf_score["chrf"] = statistics.mean(cumulative_chrf)
return chrf_score
def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate Chinese ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N counts the matching n-grams between
the preds and targets, while ROUGE-L measures the longest
common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
for pred, target in zip(preds, targets):
pred_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(pred))))
target_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(target))))
all_preds.append(pred_list)
all_targets.append(target_list)
rouge_cn = Rouge_cn()
rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True)
rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"]
rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"]
rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"]
return rouge_scores
def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate English ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N counts the matching n-grams between
the preds and targets, while ROUGE-L measures the longest
common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
for pred, target in zip(preds, targets):
score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
rouge_scores["rouge1"] += score["rouge1"].fmeasure
rouge_scores["rouge2"] += score["rouge2"].fmeasure
rouge_scores["rougeL"] += score["rougeL"].fmeasure
rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds)
return rouge_scores
def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate ROUGE Score Metric"""
if language == "cn":
return rouge_cn_score(preds, targets)
elif language == "en":
return rouge_en_score(preds, targets)
def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
"""Calculate Distinct Score Metric
This metric refers to https://arxiv.org/abs/1510.03055.
It evaluates the diversity of the generated text by counting
the unique n-grams.
"""
distinct_score = {"distinct": 0}
cumulative_distinct = []
for pred in preds:
if language == "cn":
pred_seg_list = " ".join(jieba.cut(pred)).split()
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
# prevent denominator from being 0
cumulative_distinct.append(count_unique_chars / (count_segs + 1e-6))
elif language == "en":
# calculate distinct 1-gram, 2-gram, 3-gram
unique_ngram = [set() for _ in range(0, 3)]
all_ngram_count = [0 for _ in range(0, 3)]
split_pred = preprocessing_text(pred).split()
for n in range(0, 3):
for i in range(0, len(split_pred) - n):
ngram = " ".join(split_pred[i : i + n + 1])
unique_ngram[n].add(ngram)
all_ngram_count[n] += 1
# Sometimes the answer may contain only one word. For 2-grams and 3-grams, the gram count (denominator) may be zero.
avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)]
cumulative_distinct.append(statistics.mean(avg_distinct))
distinct_score["distinct"] = statistics.mean(cumulative_distinct)
return distinct_score
def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BERTScore Metric
The BERTScore evaluates the semantic similarity between
tokens of preds and targets with BERT.
"""
bert_score = {"bert_score": 0}
pred_list = []
target_list = []
for pred, target in zip(preds, targets):
pred_list.append(pred)
target_list.append(target)
if language == "cn":
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
elif language == "en":
_, _, F = score(pred_list, target_list, lang="en", verbose=True)
bert_score["bert_score"] = F.mean().item()
return bert_score
def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Precision, Recall and F1-Score Calculation
Precision, recall and F1-score are calculated by counting the number of
overlaps between the preds and targets. The comparison length is limited
by the shorter of the two.
"""
precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
precision_scores = []
recall_scores = []
f1_scores = []
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = [char for char in " ".join(jieba.cut(preprocessing_text(pred))).split()]
target_list = [char for char in " ".join(jieba.cut(preprocessing_text(target))).split()]
elif language == "en":
pred_list = [char for char in preprocessing_text(pred).split()]
target_list = [char for char in preprocessing_text(target).split()]
target_labels = [1] * min(len(target_list), len(pred_list))
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0))
recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0))
f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0))
precision_recall_f1["precision"] = statistics.mean(precision_scores)
precision_recall_f1["recall"] = statistics.mean(recall_scores)
precision_recall_f1["f1_score"] = statistics.mean(f1_scores)
return precision_recall_f1
def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Precision Metric
Precision is calculated by counting the number of overlaps between the preds and targets.
"""
precision = {"precision": 0}
precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"]
return precision
def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Recall Metric
Recall is calculated by counting the number of overlaps between the preds and targets.
"""
recall = {"recall": 0}
recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"]
return recall
def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate F1-score Metric
F1-score is calculated by counting the number of overlaps between the preds and targets.
"""
f1 = {"f1_score": 0}
f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"]
return f1
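# Illustrative usage sketch (not part of the evaluation pipeline): each metric
# function above takes a list of predictions (and, except for distinct_score, a
# list of targets) plus a language flag ("en" or "cn") and returns a dict of scores.
# The example strings below are placeholders.
if __name__ == "__main__":
    example_preds = ["the cat sat on the mat"]
    example_targets = ["the cat is sitting on the mat"]
    print(bleu_score(example_preds, example_targets, language="en"))
    print(rouge_score(example_preds, example_targets, language="en"))
    print(distinct_score(example_preds, language="en"))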
from .evaluator import get_evaluator
from .utils import (
analyze_unieval_results,
calculate_average_score,
convert_data_to_unieval_format,
save_unieval_results,
)
__all__ = [
"get_evaluator",
"convert_data_to_unieval_format",
"calculate_average_score",
"save_unieval_results",
"analyze_unieval_results",
]
# MIT License
# Copyright (c) 2022 Ming Zhong
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import numpy as np
from nltk import sent_tokenize
from .scorer import UniEvaluator
from .utils import add_question
class SumEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for text summarization"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "summarization"
self.dimensions = ["coherence", "consistency", "fluency", "relevance"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
four dimensions: coherence, consistency, fluency, relevance.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
if dims == None:
eval_dims = self.dimensions
else:
assert isinstance(dims, list)
eval_dims = dims
for dim in eval_dims:
# Calculate average sentence-level scores for 'consistency' and 'fluency'
if dim == "consistency" or dim == "fluency":
src_list, output_list = [], []
n_sents = [] # the number of sentences in each generated summary
for i in range(n_data):
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
output_list.append(system_outputs[j])
input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
sent_score = self.scorer.score(input_list, self.task, category, dim)
# Get average score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
# prevent denominator from being 0
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
start_idx += cur_n_sent
# Calculate summary-level score for 'coherence' and 'relevance'
elif dim == "coherence" or dim == "relevance":
src_list, output_list, ref_list = [], [], []
for i in range(n_data):
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
if dim == "relevance":
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for summarization
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class DialogEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for dialogues"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "dialogue"
self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
if dims == None:
eval_dims = self.dimensions
else:
assert isinstance(dims, list)
eval_dims = dims
for dim in eval_dims:
# Calculate summation score for 'engagingness'
if dim == "engagingness":
src_list, output_list, context_list = [], [], []
n_sents = [] # the number of sentences in each generated response
for i in range(n_data):
source = data[i]["source"]
context = data[i]["context"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
context_list.append(context)
output_list.append(system_outputs[j])
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
sent_score = self.scorer.score(input_list, self.task, category, dim)
# Get the summation score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
start_idx += cur_n_sent
# Calculate turn-level score for other dimensions
elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
src_list, output_list, context_list = [], [], []
for i in range(n_data):
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
context_list.append(data[i]["context"])
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for dialogues
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class D2tEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for data-to-text"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "data2text"
self.dimensions = ["naturalness", "informativeness"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
two dimensions: naturalness and informativeness.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
if dims == None:
eval_dims = self.dimensions
else:
assert isinstance(dims, list)
eval_dims = dims
for dim in eval_dims:
output_list, ref_list = [], []
for i in range(n_data):
output_list.append(data[i]["system_output"])
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
for i in range(n_data):
eval_scores[i][dim] = score[i]
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class FactEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for factual consistency detection"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "fact"
self.dim = "consistency"
def evaluate(self, data, category):
"""
Get the factual consistency score (only 1 dimension for this task)
category: The category to be evaluated.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
# Calculate average sentence-level scores for factual consistency
src_list, output_list = [], []
n_sents = [] # the number of sentences in the claim
for i in range(n_data):
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
output_list.append(system_outputs[j])
input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
sent_score = self.scorer.score(input_list, self.task, category, self.dim)
# Get average score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / cur_n_sent)
start_idx += cur_n_sent
for i in range(n_data):
eval_scores[i][self.dim] = score[i]
return eval_scores
def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
assert task in ["summarization", "dialogue", "data2text", "fact"]
if task == "summarization":
return SumEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "dialogue":
return DialogEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "data2text":
return D2tEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "fact":
return FactEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
else:
raise NotImplementedError(
"Other tasks are not implemented, \
please customize specific tasks here."
)
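# Illustrative usage sketch (assumes the default UniEval checkpoint can be fetched
# from the Hugging Face Hub, NLTK's punkt data is installed, and a CUDA device is
# available; the data below is a placeholder):
if __name__ == "__main__":
    evaluator = get_evaluator(task="summarization", device="cuda:0")
    example_data = [
        {
            "source": "A long source document ...",
            "system_output": "A model-generated summary ...",
            "reference": "A human-written reference summary ...",
            "context": "",
        }
    ]
    # Each returned entry holds one score per dimension plus an averaged "overall" score.
    print(evaluator.evaluate(example_data, category="summarization"))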
# MIT License
# Copyright (c) 2022 Ming Zhong
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
class UniEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up model"""
self.device = device
self.max_length = max_length
self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir)
self.model.eval()
self.model.to(device)
self.softmax = nn.Softmax(dim=1)
self.pos_id = self.tokenizer("Yes")["input_ids"][0]
self.neg_id = self.tokenizer("No")["input_ids"][0]
def score(self, inputs, task, category, dim, batch_size=8):
"""
Get scores for the given samples.
final_score = positive_score / (positive_score + negative_score)
"""
# The implementation of "forward" in T5 still requires decoder_input_ids.
# Therefore, we construct a random one-word target sequence.
# The content of the target has no effect on the final scores.
tgts = ["No" for _ in range(len(inputs))]
pos_score_list, neg_score_list = [], []
for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
src_list = inputs[i : i + batch_size]
tgt_list = tgts[i : i + batch_size]
try:
with torch.no_grad():
encoded_src = self.tokenizer(
src_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
)
encoded_tgt = self.tokenizer(
tgt_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
)
src_tokens = encoded_src["input_ids"].to(self.device)
src_mask = encoded_src["attention_mask"].to(self.device)
tgt_tokens = encoded_tgt["input_ids"].to(self.device)[:, 0].unsqueeze(-1)
output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
logits = output.logits.view(-1, self.model.config.vocab_size)
pos_score = self.softmax(logits)[:, self.pos_id] # Yes
neg_score = self.softmax(logits)[:, self.neg_id] # No
cur_pos_score = [x.item() for x in pos_score]
cur_neg_score = [x.item() for x in neg_score]
pos_score_list += cur_pos_score
neg_score_list += cur_neg_score
except RuntimeError:
print(f"source: {src_list}")
print(f"target: {tgt_list}")
exit(0)
score_list = []
for i in range(len(pos_score_list)):
score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i]))
return score_list
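# Illustrative usage sketch: the scorer maps each Boolean-QA style prompt (built by
# add_question in utils.py) to P("Yes") / (P("Yes") + P("No")). The checkpoint below is
# the default summarization model and the prompt is a placeholder.
if __name__ == "__main__":
    scorer = UniEvaluator(model_name_or_path="MingZhong/unieval-sum", device="cuda:0")
    prompt = "question: Is this a fluent paragraph? </s> paragraph: The cat sat on the mat."
    print(scorer.score([prompt], task="summarization", category="generation", dim="fluency"))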
# MIT License
# Copyright (c) 2022 Ming Zhong
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os
from typing import Dict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm
def add_question(dimension, output, src=None, ref=None, context=None, task=None):
"""
Add questions to generate input in Bool-QA format for UniEval.
dimension: specific dimension to be evaluated
src: source input for different NLG tasks. For example, source document for summarization
and dialogue history for dialogue response generation.
output: output text generated by the models
ref: human-annotated ground truth
context: the context needed to evaluate certain specific dimensions. For example,
additional factual information when evaluating engagingness and groundedness in dialogues.
"""
input_with_question = []
for i in range(len(output)):
# For summarization
if task == "summarization":
if dimension == "fluency":
cur_input = "question: Is this a fluent paragraph? </s> paragraph: " + output[i]
elif dimension == "coherence":
cur_input = (
"question: Is this a coherent summary to the document? </s> summary: "
+ output[i]
+ " </s> document: "
+ src[i]
)
elif dimension == "consistency":
cur_input = (
"question: Is this claim consistent with the document? </s> claim: "
+ output[i]
+ " </s> document: "
+ src[i]
)
elif dimension == "relevance":
cur_input = (
"question: Is this summary relevant to the reference? </s> summary: "
+ output[i]
+ " </s> reference: "
+ ref[i]
)
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. Please customize it first."
)
# For dialogues
elif task == "dialogue":
if dimension == "naturalness":
cur_input = "question: Is this a natural response in the dialogue? </s> response: " + output[i]
elif dimension == "coherence":
cur_input = (
"question: Is this a coherent response given the dialogue history? </s> response: "
+ output[i]
+ " </s> dialogue history: "
+ src[i]
)
elif dimension == "engagingness":
cur_input = (
"question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: "
+ output[i]
+ " </s> dialogue history: "
+ src[i]
+ " </s> fact: "
+ context[i]
)
elif dimension == "groundedness":
cur_input = (
"question: Is this response consistent with knowledge in the fact? </s> response: "
+ output[i]
+ " </s> fact: "
+ context[i]
)
elif dimension == "understandability":
cur_input = "question: Is this an understandable response in the dialogue? </s> response: " + output[i]
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. Please customize it first."
)
# For data-to-text
elif task == "data2text":
if dimension == "naturalness":
cur_input = "question: Is this a fluent utterance? </s> utterance: " + output[i]
elif dimension == "informativeness":
cur_input = (
"question: Is this sentence informative according to the reference? </s> sentence: "
+ output[i]
+ " </s> reference: "
+ ref[i]
)
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. Please customize it first."
)
# For factual consistency detection
elif task == "fact":
if dimension == "consistency":
cur_input = (
"question: Is this claim consistent with the document? </s> claim: "
+ output[i]
+ " </s> document: "
+ src[i]
)
else:
raise NotImplementedError("No other dimensions for the factual consistency detection task.")
# For new customized tasks
else:
raise NotImplementedError("Other tasks are not implemented, please customize specific tasks here.")
input_with_question.append(cur_input)
return input_with_question
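# Illustrative sketch: for the summarization "fluency" dimension, add_question builds one
# Boolean-QA prompt per generated output, e.g.
#   add_question(dimension="fluency", output=["A short generated summary."], task="summarization")
#   -> ["question: Is this a fluent paragraph? </s> paragraph: A short generated summary."]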
def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
"""
Convert the data into UniEval's format.
output_list: a list of model outputs
src_list: source input for different NLG tasks. For example, the source document for summarization
and the dialogue history for dialogue response generation
ref_list: human-annotated ground truth
"""
json_data = []
for i in range(len(output_list)):
cur = {}
cur["system_output"] = output_list[i]
if src_list is not None:
cur["source"] = src_list[i]
if ref_list is not None:
cur["reference"] = ref_list[i]
cur["context"] = ""
json_data.append(cur)
return json_data
def calculate_average_score(scores):
"""
Calculate average scores for different metrics
scores: a list of scores for different metrics for each answer
"""
metrics = {metric: 0 for metric in scores[0]}
for score in scores:
for metric in score:
metrics[metric] += score[metric]
for metric in metrics:
metrics[metric] /= len(scores)
return metrics
def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None:
"""
Save UniEval evaluation results of different categories for one model.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
unieval_metric_stats_per_category = {}
for task, category_stat in unieval_metric_stats.items():
for category, metric_stat in category_stat.items():
if unieval_metric_stats_per_category.get(category, None) is None:
unieval_metric_stats_per_category[category] = {}
for metric, score in metric_stat.items():
unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score
automatic_df = pd.DataFrame(unieval_metric_stats_per_category)
automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]:
"""
Read a csv file and return a dictionary which stores scores per metric.
"""
results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
results_dict = {metric: {} for metric in list(results.index)}
for i, metric in enumerate(results_dict.keys()):
for j, category in enumerate(list(results.columns)):
if pd.isnull(results.iloc[i][j]):
continue
results_dict[metric][category] = results.iloc[i][j]
return results_dict
def analyze_unieval_results(results_path: str, save_path: str) -> None:
"""
Analyze and visualize all csv files in the given folder.
"""
if not os.path.exists(results_path):
raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
all_statistics = {}
for file_name in os.listdir(results_path):
if file_name.endswith("_results.csv"):
model_name = file_name.split("_results.csv")[0]
all_statistics[model_name] = read_unieval_results(results_path, file_name)
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no csv files in the given directory "{results_path}"!')
frame_all = {"model": [], "category": [], "metric": [], "score": []}
frame_per_metric = {}
for model_name, model_statistics in all_statistics.items():
for metric, metric_statistics in model_statistics.items():
if frame_per_metric.get(metric) is None:
frame_per_metric[metric] = {"model": [], "category": [], "score": []}
for category, category_score in metric_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["score"].append(category_score)
frame_per_metric[metric]["model"].append(model_name)
frame_per_metric[metric]["category"].append(category)
frame_per_metric[metric]["score"].append(category_score)
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"UniEval metrics: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
sns.set()
fig = plt.figure(figsize=(16, 10))
fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
fig.set_title(
f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}"
)
plt.xlabel("Evaluation Category")
plt.ylabel("Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
plt.close()
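# Illustrative sketch (no model required; the model name and paths are placeholders):
# per-sample scores returned by an evaluator can be averaged, saved per task and
# category, and then visualized.
if __name__ == "__main__":
    per_sample_scores = [
        {"naturalness": 0.8, "informativeness": 0.7, "overall": 0.75},
        {"naturalness": 0.6, "informativeness": 0.9, "overall": 0.75},
    ]
    averaged = calculate_average_score(per_sample_scores)
    save_unieval_results("example-model", {"data2text": {"generation": averaged}}, "./unieval_results")
    analyze_unieval_results("./unieval_results", "./unieval_figures")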
import io
import json
import os
import string
from typing import Dict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm
from zhon import hanzi
def _make_w_io_base(f, mode: str):
if not isinstance(f, io.IOBase):
f_dirname = os.path.dirname(f)
if f_dirname != "":
os.makedirs(f_dirname, exist_ok=True)
f = open(f, mode=mode)
return f
def _make_r_io_base(f, mode: str):
if not isinstance(f, io.IOBase):
f = open(f, mode=mode)
return f
def jdump(obj, f, mode="w", indent=4, default=str):
"""Dump a str or dictionary to a file in json format.
Args:
obj: An object to be written.
f: A string path to the location on disk.
mode: Mode for opening the file.
indent: Indent for storing json dictionaries.
default: A function to handle non-serializable entries; defaults to `str`.
"""
f = _make_w_io_base(f, mode)
if isinstance(obj, (dict, list)):
json.dump(obj, f, indent=indent, default=default, ensure_ascii=False)
elif isinstance(obj, str):
f.write(obj)
else:
raise ValueError(f"Unexpected type: {type(obj)}")
f.close()
def jload(f, mode="r"):
"""Load a .json file into a dictionary."""
f = _make_r_io_base(f, mode)
jdict = json.load(f)
f.close()
return jdict
def get_json_list(file_path):
with open(file_path, "r") as f:
json_list = []
for line in f:
json_list.append(json.loads(line))
return json_list
def get_data_per_category(data, categories):
data_per_category = {category: [] for category in categories}
for item in data:
category = item["category"]
if category in categories:
data_per_category[category].append(item)
return data_per_category
def remove_punctuations(text: str) -> str:
"""
Remove punctuations in the given text.
It is used in evaluation of automatic metrics.
"""
punctuation = string.punctuation + hanzi.punctuation
punctuation = set([char for char in punctuation])
punctuation.difference_update(set("!@#$%&()<>?|,.\"'"))
out = []
for char in text:
if char in punctuation:
continue
else:
out.append(char)
return "".join(out)
def remove_redundant_space(text: str) -> str:
"""
Remove redundant spaces in the given text.
It is used in evaluation of automatic metrics.
"""
return " ".join(text.split())
def preprocessing_text(text: str) -> str:
"""
Preprocess the given text.
It is used in evaluation of automatic metrics.
"""
return remove_redundant_space(remove_punctuations(text.lower()))
def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None:
"""
Save automatic evaluation results of different categories for one model.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
automatic_df = pd.DataFrame(automatic_metric_stats)
automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
def read_automatic_results(results_path: str, file_name: str) -> Dict[str, Dict]:
"""
Read a csv file and return a dictionary which stores scores per metric.
"""
results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
results_dict = {metric: {} for metric in list(results.index)}
for i, metric in enumerate(results_dict.keys()):
for j, category in enumerate(list(results.columns)):
if pd.isnull(results.iloc[i][j]):
continue
results_dict[metric][category] = results.iloc[i][j]
return results_dict
def analyze_automatic_results(results_path: str, save_path: str) -> None:
"""
Analyze and visualize all csv files in the given folder.
"""
if not os.path.exists(results_path):
raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
all_statistics = {}
for file_name in os.listdir(results_path):
if file_name.endswith("_results.csv"):
model_name = file_name.split("_results.csv")[0]
all_statistics[model_name] = read_automatic_results(results_path, file_name)
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no csv files in the given directory "{results_path}"!')
frame_all = {"model": [], "category": [], "metric": [], "score": []}
frame_per_metric = {}
for model_name, model_statistics in all_statistics.items():
for metric, metric_statistics in model_statistics.items():
if frame_per_metric.get(metric) is None:
frame_per_metric[metric] = {"model": [], "category": [], "score": []}
for category, category_score in metric_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["score"].append(category_score)
frame_per_metric[metric]["model"].append(model_name)
frame_per_metric[metric]["category"].append(category)
frame_per_metric[metric]["score"].append(category_score)
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"automatic metrics: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
sns.set()
fig = plt.figure(figsize=(16, 10))
fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
fig.set_title(f"Comparison between Different Models for Metric {metric.title()}")
plt.xlabel("Evaluation Category")
plt.ylabel("Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
plt.close()
from .agieval import AGIEvalDataset
from .base import BaseDataset
from .ceval import CEvalDataset
from .cmmlu import CMMLUDataset
from .colossalai import ColossalDataset
from .gaokaobench import GaoKaoBenchDataset
from .longbench import LongBenchDataset
from .mmlu import MMLUDataset
__all__ = [
"AGIEvalDataset",
"BaseDataset",
"CEvalDataset",
"CMMLUDataset",
"GaoKaoBenchDataset",
"LongBenchDataset",
"MMLUDataset",
"ColossalDataset",
]
# Adapted from https://github.com/ruixiangcui/AGIEval/blob/main/src/dataset_loader.py.
import ast
import glob
import os
from copy import deepcopy
from typing import Dict, List
import pandas as pd
from colossal_eval.utils import get_json_list
from colossalai.logging import DistributedLogger
from .base import BaseDataset
# define the datasets
english_qa_datasets = [
"lsat-ar",
"lsat-lr",
"lsat-rc",
"logiqa-en",
"sat-math",
"sat-en",
"aqua-rat",
"sat-en-without-passage",
"gaokao-english",
]
chinese_qa_datasets = [
"logiqa-zh",
"jec-qa-kd",
"jec-qa-ca",
"gaokao-chinese",
"gaokao-geography",
"gaokao-history",
"gaokao-biology",
"gaokao-chemistry",
"gaokao-physics",
"gaokao-mathqa",
]
english_cloze_datasets = ["math"]
chinese_cloze_datasets = ["gaokao-mathcloze"]
multi_choice_datasets = ["jec-qa-kd", "jec-qa-ca", "gaokao-physics", "gaokao-mathqa"]
math_output_datasets = {"gaokao-mathcloze", "math"}
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_prompt(line: Dict, dataset_name: str, logger: DistributedLogger) -> Dict:
"""Modified from https://github.com/microsoft/AGIEval/blob/main/src/dataset_loader.py#L190"""
try:
all_classes = None
passage = line["passage"] if line["passage"] is not None else ""
if dataset_name in english_qa_datasets:
option_string = "ABCDEFG"
count = len(line["options"])
input = (
"Question: "
+ line["question"]
+ " "
+ "Choose from the following options: "
+ " ".join(line["options"])
+ "\n"
+ "Answer: "
)
all_classes = list(option_string[0:count])
elif dataset_name in chinese_qa_datasets:
option_string = "ABCDEFG"
count = len(line["options"])
input = "问题:" + line["question"] + " " + "从以下选项中选择:" + " ".join(line["options"]) + "\n" + "答案:"
all_classes = list(option_string[0:count])
elif dataset_name in english_cloze_datasets:
input = "Question: " + line["question"] + "\n" + "Answer: "
elif dataset_name in chinese_cloze_datasets:
input = "问题:" + line["question"] + "\n" + "答案:"
return {
"instruction": input if not passage else passage + "\n\n" + input,
"target": line["label"] if line["label"] else line["answer"],
}, all_classes
except NameError:
logger.info("Dataset not defined.")
# process few-shot raw_prompts
def combine_prompt(prompt_path, dataset_name, load_explanation=True, chat_mode=False):
skip_passage = False
if dataset_name == "sat-en-without-passage":
skip_passage = True
dataset_name = "sat-en"
demonstrations = []
# read the prompts by context and explanation
context_row = [0, 1, 3, 5, 7, 9]
explanation_row = [0, 2, 4, 6, 8, 10]
raw_prompts_context = pd.read_csv(
prompt_path, header=0, skiprows=lambda x: x not in context_row, keep_default_na=False
)
raw_prompts_explanation = pd.read_csv(
prompt_path, header=0, skiprows=lambda x: x not in explanation_row, keep_default_na=False
).replace(r"\n\n", "\n", regex=True)
contexts = []
for line in list(raw_prompts_context[dataset_name]):
if line:
# print(line)
contexts.append(ast.literal_eval(line))
explanations = [exp for exp in raw_prompts_explanation[dataset_name] if exp]
for idx, (con, exp) in enumerate(zip(contexts, explanations)):
passage = con["passage"] if con["passage"] is not None and not skip_passage else ""
question = con["question"]
options = con["options"] if con["options"] is not None else ""
label = con["label"] if con["label"] is not None else ""
answer = con["answer"] if "answer" in con and con["answer"] is not None else ""
if dataset_name in english_qa_datasets:
question_input = (
"Question: "
+ passage
+ " "
+ question
+ "\n"
+ "Choose from the following options: "
+ " ".join(options)
+ "\n"
+ "Answer: {}".format(label)
)
elif dataset_name in chinese_qa_datasets:
question_input = (
"问题:" + passage + " " + question + "\n" + "从以下选项中选择:" + " ".join(options) + "\n" + "答案:{}".format(label)
)
elif dataset_name in english_cloze_datasets:
question_input = "Question: ".format(idx + 1) + question + "\n" + "Answer: {}".format(answer)
elif dataset_name in chinese_cloze_datasets:
question_input = "问题:" + question + "\n" + "答案:{}".format(answer)
else:
raise ValueError(f"During loading few-sot examples, found unknown dataset: {dataset_name}")
if chat_mode:
demonstrations.append((question_input,))
else:
demonstrations.append(question_input + "\n")
return demonstrations
class AGIEvalDataset(BaseDataset):
"""
Dataset wrapper for AGIEval dataset.
Data source: https://github.com/microsoft/AGIEval
This dataset class will convert the original dataset into the inference dataset.
A few dirty data samples in the original dataset need to be manually corrected:
Issue link: https://github.com/microsoft/AGIEval/issues/16
1. Invalid options in line 190 in gaokao-chemistry.jsonl.
2. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en-without-passage.jsonl.
3. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en.jsonl.
4. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en-without-passage.jsonl.
5. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en.jsonl.
6. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en-without-passage.jsonl.
7. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en.jsonl.
8. Label is empty in line 212 in jec-qa-kd.jsonl. Content is also dirty.
9. gaokao-mathqa.jsonl is actually also a multi-choice dataset. See lines 149, 286 and 287.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": {}}
files = glob.glob(os.path.join(path, "*.jsonl"))
files.sort()
if few_shot:
prompt_path = os.path.join(path, "few_shot_prompts.csv")
for file in files:
dataset_name = os.path.basename(file)[0 : -len(".jsonl")]
few_shot_data = []
if few_shot:
# process demo once if it is few-shot-CoT
few_shot_data = combine_prompt(prompt_path, dataset_name, load_explanation=False, chat_mode=False)
dataset["test"][dataset_name] = {"data": []}
file_dir = os.path.join(path, file)
loaded_jsonl = get_json_list(file_dir)
# It has been verified that each data sample in one subcategory has the same inference arguments.
_, all_classes = get_prompt(loaded_jsonl[0], dataset_name, logger)
inference_kwargs = deepcopy(default_inference_kwargs)
if all_classes is not None and dataset_name not in multi_choice_datasets:
inference_kwargs["all_classes"] = all_classes
if dataset_name in english_qa_datasets:
inference_kwargs["language"] = "English"
if dataset_name in chinese_qa_datasets:
inference_kwargs["language"] = "Chinese"
inference_kwargs["few_shot_data"] = few_shot_data
dataset["test"][dataset_name]["inference_kwargs"] = inference_kwargs
for line in loaded_jsonl:
info, all_classes = get_prompt(line, dataset_name, logger)
# Convert multi-choice answers to a single string.
# We will convert it back when evaluating.
# We do this because a list-type target should only be used for questions with multiple target answers.
if dataset_name in multi_choice_datasets:
if isinstance(info["target"], str) and len(info["target"]) > 1:
# "gaokao-mathqa" actually contain multi-choice questions.
# This if clause is specially used for it.
info["target"] = "".join(info["target"].split())
else:
info["target"] = "".join(info["target"])
if isinstance(info["target"], list) and len(info["target"]) == 1:
info["target"] = info["target"][0]
data_sample = {
"dataset": "agieval",
"split": "test",
"category": dataset_name,
"instruction": info["instruction"],
"input": "",
"output": "",
"target": info["target"],
}
dataset["test"][dataset_name]["data"].append(data_sample)
return dataset
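# Illustrative sketch (the data path is a placeholder; get_dist_logger is assumed to be
# usable outside a launched distributed run):
#   from colossalai.logging import get_dist_logger
#   dataset = AGIEvalDataset(path="path/to/AGIEval/data/v1", logger=get_dist_logger(), few_shot=False)
#   dataset.save("agieval_inference_data.json")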
from abc import abstractstaticmethod
from colossal_eval.utils import jdump
class BaseDataset:
"""
Base class for dataset wrapper.
Args:
path: The path to the original dataset.
logger: Logger for the dataset.
"""
def __init__(self, path, logger, few_shot):
self.dataset = self.load(path, logger, few_shot)
def save(self, save_path):
"""Save the converted dataset"""
jdump(self.dataset, save_path)
@abstractstaticmethod
def load(path, logger, few_shot):
"""Load the original dataset and convert it into the inference dataset"""
import copy
import csv
import os
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
ceval_subject_mapping = {
"computer_network": ["Computer Network", "计算机网络", "STEM"],
"operating_system": ["Operating System", "操作系统", "STEM"],
"computer_architecture": ["Computer Architecture", "计算机组成", "STEM"],
"college_programming": ["College Programming", "大学编程", "STEM"],
"college_physics": ["College Physics", "大学物理", "STEM"],
"college_chemistry": ["College Chemistry", "大学化学", "STEM"],
"advanced_mathematics": ["Advanced Mathematics", "高等数学", "STEM"],
"probability_and_statistics": ["Probability and Statistics", "概率统计", "STEM"],
"discrete_mathematics": ["Discrete Mathematics", "离散数学", "STEM"],
"electrical_engineer": ["Electrical Engineer", "注册电气工程师", "STEM"],
"metrology_engineer": ["Metrology Engineer", "注册计量师", "STEM"],
"high_school_mathematics": ["High School Mathematics", "高中数学", "STEM"],
"high_school_physics": ["High School Physics", "高中物理", "STEM"],
"high_school_chemistry": ["High School Chemistry", "高中化学", "STEM"],
"high_school_biology": ["High School Biology", "高中生物", "STEM"],
"middle_school_mathematics": ["Middle School Mathematics", "初中数学", "STEM"],
"middle_school_biology": ["Middle School Biology", "初中生物", "STEM"],
"middle_school_physics": ["Middle School Physics", "初中物理", "STEM"],
"middle_school_chemistry": ["Middle School Chemistry", "初中化学", "STEM"],
"veterinary_medicine": ["Veterinary Medicine", "兽医学", "STEM"],
"college_economics": ["College Economics", "大学经济学", "Social Science"],
"business_administration": ["Business Administration", "工商管理", "Social Science"],
"marxism": ["Marxism", "马克思主义基本原理", "Social Science"],
"mao_zedong_thought": ["Mao Zedong Thought", "毛泽东思想和中国特色社会主义理论体系概论", "Social Science"],
"education_science": ["Education Science", "教育学", "Social Science"],
"teacher_qualification": ["Teacher Qualification", "教师资格", "Social Science"],
"high_school_politics": ["High School Politics", "高中政治", "Social Science"],
"high_school_geography": ["High School Geography", "高中地理", "Social Science"],
"middle_school_politics": ["Middle School Politics", "初中政治", "Social Science"],
"middle_school_geography": ["Middle School Geography", "初中地理", "Social Science"],
"modern_chinese_history": ["Modern Chinese History", "近代史纲要", "Humanities"],
"ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "思想道德修养与法律基础", "Humanities"],
"logic": ["Logic", "逻辑学", "Humanities"],
"law": ["Law", "法学", "Humanities"],
"chinese_language_and_literature": ["Chinese Language and Literature", "中国语言文学", "Humanities"],
"art_studies": ["Art Studies", "艺术学", "Humanities"],
"professional_tour_guide": ["Professional Tour Guide", "导游资格", "Humanities"],
"legal_professional": ["Legal Professional", "法律职业资格", "Humanities"],
"high_school_chinese": ["High School Chinese", "高中语文", "Humanities"],
"high_school_history": ["High School History", "高中历史", "Humanities"],
"middle_school_history": ["Middle School History", "初中历史", "Humanities"],
"civil_servant": ["Civil Servant", "公务员", "Other"],
"sports_science": ["Sports Science", "体育学", "Other"],
"plant_protection": ["Plant Protection", "植物保护", "Other"],
"basic_medicine": ["Basic Medicine", "基础医学", "Other"],
"clinical_medicine": ["Clinical Medicine", "临床医学", "Other"],
"urban_and_rural_planner": ["Urban and Rural Planner", "注册城乡规划师", "Other"],
"accountant": ["Accountant", "注册会计师", "Other"],
"fire_engineer": ["Fire Engineer", "注册消防工程师", "Other"],
"environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "环境影响评价工程师", "Other"],
"tax_accountant": ["Tax Accountant", "税务师", "Other"],
"physician": ["Physician", "医师资格", "Other"],
}
default_inference_kwargs = {
"calculate_loss": False,
"all_classes": ["A", "B", "C", "D"],
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_few_shot_data(data: List[Dict]):
few_shot_data = []
for i in data:
few_shot_data.append(i["input"] + i["target"])
return few_shot_data
class CEvalDataset(BaseDataset):
"""
Dataset class for CEval dataset.
Data source: https://huggingface.co/datasets/ceval/ceval-exam
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"dev": {}, "test": {}}
for split in ["dev", "test"]:
files = os.listdir(os.path.join(path, split))
files.sort()
for file in files:
subject = file[0 : -len(f"_{split}.csv")]
subject = ceval_subject_mapping[subject][1]
file_dir = os.path.join(path, split, file)
dataset[split][subject] = {"data": []}
# It has been verified that all data samples within one subcategory share the same inference arguments.
dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
if split == "test" and few_shot:
dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
dataset["dev"][subject]["data"]
)
with open(file_dir, encoding="utf-8") as f:
reader = csv.reader(f)
_ = next(reader)
for row in reader:
# The dev split has an answer and an explanation, so len(row) is 8,
# but the test split doesn't contain them, so len(row) is 6.
assert len(row) >= 6
choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
data_sample = {
"dataset": "ceval",
"split": split,
"category": subject,
"instruction": f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。",
"input": f"题目:{row[1]}\n{choices}\n答案:",
"output": "",
"target": row[6] if split == "dev" else "",
"id": int(row[0]),
}
dataset[split][subject]["data"].append(data_sample)
return dataset
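# Minimal usage sketch (the path below is hypothetical; it should point to the unpacked
# ceval-exam folder containing the `dev/` and `test/` subdirectories):
#
#   from colossalai.logging import get_dist_logger
#   dataset = CEvalDataset.load("path/to/ceval-exam", get_dist_logger(), few_shot=True)
#   # e.g. inspect one subject's few-shot examples built from the dev split
#   print(dataset["test"]["高中物理"]["inference_kwargs"]["few_shot_data"][0])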
import copy
import csv
import os
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
cmmlu_subject_mapping = {
"agronomy": "农学",
"anatomy": "解剖学",
"ancient_chinese": "古汉语",
"arts": "艺术学",
"astronomy": "天文学",
"business_ethics": "商业伦理",
"chinese_civil_service_exam": "中国公务员考试",
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
"chinese_history": "中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
"college_actuarial_science": "大学精算学",
"college_education": "大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
"college_medical_statistics": "大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
"conceptual_physics": "概念物理学",
"construction_project_management": "建设工程管理",
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
"elementary_chinese": "小学语文",
"elementary_commonsense": "小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
"food_science": "食品科学",
"genetics": "遗传学",
"global_facts": "全球事实",
"high_school_biology": "高中生物",
"high_school_chemistry": "高中化学",
"high_school_geography": "高中地理",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理学",
"high_school_politics": "高中政治",
"human_sexuality": "人类性行为",
"international_law": "国际法学",
"journalism": "新闻学",
"jurisprudence": "法理学",
"legal_and_moral_basis": "法律与道德基础",
"logical": "逻辑学",
"machine_learning": "机器学习",
"management": "管理学",
"marketing": "市场营销",
"marxist_theory": "马克思主义理论",
"modern_chinese": "现代汉语",
"nutrition": "营养学",
"philosophy": "哲学",
"professional_accounting": "专业会计",
"professional_law": "专业法学",
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
"security_study": "安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
"world_history": "世界历史",
"world_religions": "世界宗教",
}
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": ["A", "B", "C", "D"],
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_few_shot_data(data: List[Dict]):
few_shot_data = []
for i in data:
few_shot_data.append(i["input"] + i["target"])
return few_shot_data
class CMMLUDataset(BaseDataset):
"""
Dataset class for CMMLU dataset.
Data source: https://github.com/haonan-li/CMMLU/tree/master/data
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"dev": {}, "test": {}}
for split in ["dev", "test"]:
files = os.listdir(os.path.join(path, split))
files.sort()
for file in files:
subject = file[0 : -len(".csv")]
subject = cmmlu_subject_mapping[subject]
file_dir = os.path.join(path, split, file)
dataset[split][subject] = {"data": []}
# It has been verified that all data samples within one subcategory share the same inference arguments.
dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
if split == "test" and few_shot:
dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
dataset["dev"][subject]["data"]
)
with open(file_dir, encoding="utf-8") as f:
reader = csv.reader(f)
_ = next(reader)
for row in reader:
assert len(row) == 7
choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
data_sample = {
"dataset": "cmmlu",
"split": split,
"category": subject,
"instruction": f"以下是关于{subject}的单项选择题,请直接给出正确答案的选项。",
"input": f"题目:{row[1]}\n{choices}\n答案:",
"output": "",
"target": row[6],
}
dataset[split][subject]["data"].append(data_sample)
return dataset
from collections import defaultdict
from copy import deepcopy
from typing import Dict, List
from colossal_eval.utils import jload
from colossalai.logging import DistributedLogger
from .base import BaseDataset
default_inference_kwargs = {
"calculate_loss": False,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 256,
}
# You can add your own subcategories here and specify whether each one is a single-choice question or has target answers and needs loss calculation.
single_choice_question = set()
calculate_loss = set()
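# Example (hypothetical category names; use the `category` values from your own JSON file):
#   single_choice_question = {"logic_mcq"}   # evaluated against all_classes ["A", "B", "C", "D"]
#   calculate_loss = {"closed_qa"}           # loss is computed against the target answer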
def get_data_per_category(data):
data_per_category = defaultdict(list)
for item in data:
category = item["category"]
data_per_category[category].append(item)
return data_per_category
class ColossalDataset(BaseDataset):
"""
Dataset class for Colossal dataset.
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": {}}
data = jload(path)
data_per_category = get_data_per_category(data)
categories = list(data_per_category.keys())
for category in categories:
dataset["test"][category] = {"data": []}
category_data = data_per_category[category]
dataset["test"][category]["inference_kwargs"] = deepcopy(default_inference_kwargs)
if category in calculate_loss:
dataset["test"][category]["inference_kwargs"]["calculate_loss"] = True
if category in single_choice_question:
dataset["test"][category]["inference_kwargs"]["all_classes"] = ["A", "B", "C", "D"]
for item in category_data:
data_sample = {
"dataset": "colossal",
"split": "test",
"category": category,
"instruction": item["instruction"],
"input": item["input"],
"output": "",
"target": item["target"],
"id": item["id"],
}
dataset["test"][category]["data"].append(data_sample)
return dataset
import json
import os
import re
from copy import deepcopy
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
multi_choice_datasets = [
"Chinese Lang and Usage MCQs",
"Chinese Modern Lit",
"English Fill in Blanks",
"English Reading Comp",
"Geography MCQs",
"Physics MCQs",
"English Cloze Test",
]
chinese_qa_datasets = [
"Biology MCQs",
"Chemistry MCQs",
"Chinese Lang and Usage MCQs",
"Chinese Modern Lit",
"Geography MCQs",
"History MCQs",
"Math I MCQs",
"Math II MCQs",
"Physics MCQs",
"Political Science MCQs",
]
english_qa_datasets = ["English MCQs", "English Fill in Blanks", "English Reading Comp", "English Cloze Test"]
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_all_classes(instruction: str):
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
pattern = r"([A-Z]\. |[A-Z].|[A-Z]\.)"
options = sorted(list(set(re.findall(pattern, instruction))))
options = sorted(list(set([string[0] for string in options])))
for i in range(len(options)):
if options[i] == letters[i]:
continue
else:
return options[0:i]
return options
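# Worked example: for an instruction containing "A. 甲\nB. 乙\nC. 丙\nD. 丁" the regex collects
# "A. ", "B. ", "C. ", "D. ", so the function returns ["A", "B", "C", "D"]. Stray capital Latin
# letters elsewhere in the question text (e.g. in "NaOH") are dropped by the early return, which
# keeps only the contiguous prefix "A", "B", "C", ... of option labels.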
class GaoKaoBenchDataset(BaseDataset):
"""
Dataset class for GAOKAO-Bench dataset.
Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
This dataset class will convert the original dataset into the inference dataset.
A few typos in the original dataset had to be corrected manually; the following issues are among those fixed.
Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": {}}
for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
files = os.listdir(os.path.join(path, "data", category))
files.sort()
for file in files:
subject = file[10:-5].split("_")
subject = " ".join(subject)
dataset["test"][subject] = {"data": []}
file_dir = os.path.join(path, "data", category, file)
with open(file_dir, encoding="utf-8") as f:
data = json.load(f)
# It has been verified that all data samples within one subcategory share the same inference arguments.
inference_kwargs = deepcopy(default_inference_kwargs)
if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets:
all_classes = get_all_classes(data["example"][0]["question"])
inference_kwargs["all_classes"] = all_classes
if subject in english_qa_datasets:
inference_kwargs["language"] = "English"
if subject in chinese_qa_datasets:
inference_kwargs["language"] = "Chinese"
dataset["test"][subject]["inference_kwargs"] = inference_kwargs
for sample in data["example"]:
# Convert multi-choice answers to a single string.
# We will convert it back when evaluating.
# We do this because a target that is a list should only be used for multiple reference answers.
if subject in multi_choice_datasets:
sample["answer"] = "".join(sample["answer"])
if isinstance(sample["answer"], list) and len(sample["answer"]) == 1:
sample["answer"] = sample["answer"][0]
data_sample = {
"dataset": "gaokaobench",
"split": "test",
"category": f"{category[:-10]}-{subject}",
"instruction": sample["question"].strip() + "\n答案:",
"input": "",
"output": "",
"target": sample["answer"],
}
dataset["test"][subject]["data"].append(data_sample)
return dataset
import os
from copy import deepcopy
from typing import Dict, List
from colossal_eval.utils import get_json_list
from colossalai.logging import DistributedLogger
from .base import BaseDataset
dataset2prompt = {
"narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
"multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
"hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
"gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
"qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
"multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
"vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
"trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
"triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
"samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
"lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
"passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
"passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
"passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
"lcc": "Please complete the code given below. \n{context}Next line of code:\n",
"repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n",
}
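# Each template is filled with the raw LongBench sample via str.format in `load`, e.g.
# (hypothetical sample): dataset2prompt["hotpotqa"].format(context="…passages…", input="Who wrote X?")
# Extra keys in the sample (such as "answers") are simply ignored when passed through format(**sample).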
dataset2maxlen = {
"narrativeqa": 128,
"qasper": 128,
"multifieldqa_en": 64,
"multifieldqa_zh": 64,
"hotpotqa": 32,
"2wikimqa": 32,
"musique": 32,
"dureader": 128,
"gov_report": 512,
"qmsum": 512,
"multi_news": 512,
"vcsum": 512,
"trec": 64,
"triviaqa": 32,
"samsum": 128,
"lsht": 64,
"passage_count": 32,
"passage_retrieval_en": 32,
"passage_retrieval_zh": 32,
"lcc": 64,
"repobench-p": 64,
}
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
class LongBenchDataset(BaseDataset):
"""
Dataset class for LongBench dataset.
Data source: https://huggingface.co/datasets/THUDM/LongBench
This dataset class will convert the original dataset into the inference dataset.
Issue link: https://github.com/THUDM/LongBench/issues/15 (fixed)
There are duplicate target answers in `nq.jsonl`, but this doesn't affect evaluation results.
It also doesn't affect the perplexity calculation (the program only needs to select the minimum loss).
"""
@staticmethod
def load(path: str, logger: DistributedLogger) -> List[Dict]:
dataset = {"test": {}}
files = os.listdir(path)
files.sort()
for file in files:
category = file[0:-6]
if category.endswith("_e"):
continue
dataset["test"][category] = {"data": []}
file_dir = os.path.join(path, file)
loaded_jsonl = get_json_list(file_dir)
# It has been verified that all data samples within one subcategory share the same inference arguments.
inference_kwargs = deepcopy(default_inference_kwargs)
if loaded_jsonl[0]["all_classes"] is not None:
inference_kwargs["all_classes"] = loaded_jsonl[0]["all_classes"]
inference_kwargs["max_new_tokens"] = dataset2maxlen[category]
dataset["test"][category]["inference_kwargs"] = inference_kwargs
for sample in loaded_jsonl:
prompt = dataset2prompt[category].format(**sample)
data_sample = {
"dataset": "longbench",
"split": "test",
"category": category,
"instruction": prompt,
"input": "",
"output": "",
"target": sample["answers"],
}
dataset["test"][category]["data"].append(data_sample)
return dataset
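# Minimal usage sketch (the path is hypothetical; it should contain the LongBench *.jsonl files):
#
#   from colossalai.logging import get_dist_logger
#   dataset = LongBenchDataset.load("path/to/LongBench/data", get_dist_logger())
#   print(dataset["test"]["hotpotqa"]["inference_kwargs"]["max_new_tokens"])  # 32, per dataset2maxlen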
import copy
import csv
import os
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": ["A", "B", "C", "D"],
"language": "English",
"pretrain": False,
"max_new_tokens": 32,
}
def get_few_shot_data(data: List[Dict]):
few_shot_data = []
for i in data:
few_shot_data.append(i["input"] + i["target"])
return few_shot_data
class MMLUDataset(BaseDataset):
"""
Dataset class for MMLU dataset.
Data source: https://github.com/hendrycks/test
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"dev": {}, "test": {}}
for split in ["dev", "test"]:
files = os.listdir(os.path.join(path, split))
files.sort()
for file in files:
subject = file[0 : -len(f"_{split}.csv")].split("_")
subject = " ".join([word.title() if word != "us" else "US" for word in subject])
file_dir = os.path.join(path, split, file)
dataset[split][subject] = {"data": [], "inference_kwargs": {}}
# It has been verified that all data samples within one subcategory share the same inference arguments.
dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
if split == "test" and few_shot:
dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
dataset["dev"][subject]["data"]
)
with open(file_dir, encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
assert len(row) == 6
choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"
data_sample = {
"dataset": "mmlu",
"split": split,
"category": subject,
"instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",
"input": f"Question: {row[0]}\n{choices}\nAnswer: ",
"output": "",
"target": row[5],
}
dataset[split][subject]["data"].append(data_sample)
return dataset