Unverified commit ce777853, authored by Yuanchen, committed by GitHub

[feature] ColossalEval: Evaluation Pipeline for LLMs (#4786)



* Add ColossalEval

* Delete evaluate in Chat

---------
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
parent 74aa7d96
{
"language": "cn",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"naturalness",
"engagingness",
"fidelity"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
]
},
"closed_qa": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
]
},
"extraction": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"logical_reasoning": {
"GPT": [
"correctness",
"relevance",
"reasonableness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
]
},
"Finance": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Law": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Education": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Medical": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"STEM": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"SocialScience": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Humanity": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"Other": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
},
"ethics": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
]
}
}
}
{
"language": "en",
"path_for_UniEval": {
"summarization": "path to unieval-sum",
"dialogue": "path to unieval-dialog",
"data2text": "path to unieval-sum"
},
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"reasonableness"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"chat": {
"GPT": [
"language organization",
"naturalness",
"engagingness",
"fidelity"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"dialogue-naturalness",
"dialogue-coherence",
"dialogue-understandability",
"data2text-naturalness",
"data2text-informativeness"
]
},
"classification": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"closed_qa": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"extraction": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score",
"CHRF"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"logical_reasoning": {
"GPT": [
"correctness",
"relevance",
"reasonableness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
],
"UniEval": [
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
],
"UniEval": [
"summarization-fluency",
"data2text-naturalness",
"data2text-informativeness"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore",
"CHRF"
],
"UniEval": [
]
},
"Finance": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Law": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Education": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Medical": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"STEM": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"SocialScience": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Humanity": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"Other": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
},
"ethics": {
"GPT": [
"relevance",
"correctness"
],
"Metrics": [
],
"UniEval": [
]
}
}
}
import statistics
from typing import Dict, List
import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import sentence_chrf
from rouge_chinese import Rouge as Rouge_cn
from rouge_score import rouge_scorer as Rouge_en
from sklearn.metrics import f1_score, precision_score, recall_score
from utils import preprocessing_text, remove_redundant_space
def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BLEU Score Metric
The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
BLEU-3 for trigram and BLEU-4 for 4-gram. Unigrams evaluate accuracy
at the word level, while higher-order n-grams evaluate fluency at the
sentence level.
"""
bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
cumulative_bleu = [0] * 4
weights = [
(1.0 / 1.0, 0.0, 0.0, 0.0),
(1.0 / 2.0, 1.0 / 2.0, 0.0, 0.0),
(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0),
(1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0),
]
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
target_list = [(" ".join(jieba.cut(preprocessing_text(target)))).split()]
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = [preprocessing_text(target).split()]
bleu = sentence_bleu(target_list, pred_list, weights=weights)
cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
for i in range(len(cumulative_bleu)):
bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds)
return bleu_scores
def chrf_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate CHRF Score Metric in sentence level."""
chrf_score = {"chrf": 0}
cumulative_chrf = []
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = " ".join(jieba.cut(preprocessing_text(pred))).split()
target_list = " ".join(jieba.cut(preprocessing_text(target))).split()
elif language == "en":
pred_list = preprocessing_text(pred).split()
target_list = preprocessing_text(target).split()
cumulative_chrf.append(sentence_chrf(target_list, pred_list))
chrf_score["chrf"] = statistics.mean(cumulative_chrf)
return chrf_score
def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate Chinese ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N counts the matching n-grams between
the preds and targets, while ROUGE-L measures the longest
common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
for pred, target in zip(preds, targets):
pred_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(pred))))
target_list = remove_redundant_space(" ".join(jieba.cut(preprocessing_text(target))))
all_preds.append(pred_list)
all_targets.append(target_list)
rouge_cn = Rouge_cn()
rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True)
rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"]
rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"]
rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"]
return rouge_scores
def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate English ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N counts the matching n-grams between
the preds and targets, while ROUGE-L measures the longest
common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
for pred, target in zip(preds, targets):
score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
rouge_scores["rouge1"] += score["rouge1"].fmeasure
rouge_scores["rouge2"] += score["rouge2"].fmeasure
rouge_scores["rougeL"] += score["rougeL"].fmeasure
rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds)
return rouge_scores
def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate ROUGE Score Metric"""
if language == "cn":
return rouge_cn_score(preds, targets)
elif language == "en":
return rouge_en_score(preds, targets)
def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
"""Calculate Distinct Score Metric
This metric refers to https://arxiv.org/abs/1510.03055.
It evaluates the diversity of the generated text by counting
the unique n-grams.
"""
distinct_score = {"distinct": 0}
cumulative_distinct = []
for pred in preds:
if language == "cn":
pred_seg_list = " ".join(jieba.cut(pred)).split()
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
# prevent denominator from being 0
cumulative_distinct.append(count_unique_chars / (count_segs + 1e-6))
elif language == "en":
# calculate distinct 1-gram, 2-gram, 3-gram
unique_ngram = [set() for _ in range(0, 3)]
all_ngram_count = [0 for _ in range(0, 3)]
split_pred = preprocessing_text(pred).split()
for n in range(0, 3):
for i in range(0, len(split_pred) - n):
ngram = " ".join(split_pred[i : i + n + 1])
unique_ngram[n].add(ngram)
all_ngram_count[n] += 1
# Sometimes the answer may contain only one word. For 2-grams and 3-grams, the gram count (denominator) may be zero.
avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)]
cumulative_distinct.append(statistics.mean(avg_distinct))
distinct_score["distinct"] = statistics.mean(cumulative_distinct)
return distinct_score
def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BERTScore Metric
The BERTScore evaluates the semantic similarity between
tokens of preds and targets with BERT.
"""
bert_score = {"bert_score": 0}
pred_list = []
target_list = []
for pred, target in zip(preds, targets):
pred_list.append(pred)
target_list.append(target)
if language == "cn":
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
elif language == "en":
_, _, F = score(pred_list, target_list, lang="en", verbose=True)
bert_score["bert_score"] = F.mean().item()
return bert_score
def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Precision, Recall and F1-Score Calculation
Precision, recall and F1-score are calculated by counting the number of
overlaps between the preds and targets. The comparison length is limited
by the shorter of the two.
"""
precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
precision_scores = []
recall_scores = []
f1_scores = []
for pred, target in zip(preds, targets):
if language == "cn":
pred_list = [char for char in " ".join(jieba.cut(preprocessing_text(pred))).split()]
target_list = [char for char in " ".join(jieba.cut(preprocessing_text(target))).split()]
elif language == "en":
pred_list = [char for char in preprocessing_text(pred).split()]
target_list = [char for char in preprocessing_text(target).split()]
target_labels = [1] * min(len(target_list), len(pred_list))
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0))
recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0))
f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0))
precision_recall_f1["precision"] = statistics.mean(precision_scores)
precision_recall_f1["recall"] = statistics.mean(recall_scores)
precision_recall_f1["f1_score"] = statistics.mean(f1_scores)
return precision_recall_f1
def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Precision Metric
Precision is calculated by counting the number of overlaps between the preds and targets.
"""
precision = {"precision": 0}
precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"]
return precision
def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Recall Metric
Recall is calculated by counting the number of overlaps between the preds and targets.
"""
recall = {"recall": 0}
recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"]
return recall
def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate F1-score Metric
F1-score is calculated by counting the number of overlaps between the preds and targets.
"""
f1 = {"f1_score": 0}
f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"]
return f1
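# Illustrative usage sketch (not part of the evaluation pipeline): each metric
# function above takes a list of predictions (and, except for distinct_score, a
# list of targets) plus a language flag ("en" or "cn") and returns a dict of scores.
# The example strings below are placeholders.
if __name__ == "__main__":
    example_preds = ["the cat sat on the mat"]
    example_targets = ["the cat is sitting on the mat"]
    print(bleu_score(example_preds, example_targets, language="en"))
    print(rouge_score(example_preds, example_targets, language="en"))
    print(distinct_score(example_preds, language="en"))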
from .evaluator import get_evaluator
from .utils import (
analyze_unieval_results,
calculate_average_score,
convert_data_to_unieval_format,
save_unieval_results,
)
__all__ = [
"get_evaluator",
"convert_data_to_unieval_format",
"calculate_average_score",
"save_unieval_results",
"analyze_unieval_results",
]
# MIT License
# Copyright (c) 2022 Ming Zhong
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import numpy as np
from nltk import sent_tokenize
from .scorer import UniEvaluator
from .utils import add_question
class SumEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for text summarization"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "summarization"
self.dimensions = ["coherence", "consistency", "fluency", "relevance"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
four dimensions: coherence, consistency, fluency, relevance.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
if dims == None:
eval_dims = self.dimensions
else:
assert isinstance(dims, list)
eval_dims = dims
for dim in eval_dims:
# Calculate average sentence-level scores for 'consistency' and 'fluency'
if dim == "consistency" or dim == "fluency":
src_list, output_list = [], []
n_sents = [] # the number of sentences in each generated summary
for i in range(n_data):
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
output_list.append(system_outputs[j])
input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
sent_score = self.scorer.score(input_list, self.task, category, dim)
# Get average score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
# prevent denominator from being 0
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
start_idx += cur_n_sent
# Calculate summary-level score for 'coherence' and 'relevance'
elif dim == "coherence" or dim == "relevance":
src_list, output_list, ref_list = [], [], []
for i in range(n_data):
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
if dim == "relevance":
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for summarization
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class DialogEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for dialogues"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "dialogue"
self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
five dimensions: naturalness, coherence, engagingness, groundedness and understandability.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
if dims == None:
eval_dims = self.dimensions
else:
assert isinstance(dims, list)
eval_dims = dims
for dim in eval_dims:
# Calculate summation score for 'engagingness'
if dim == "engagingness":
src_list, output_list, context_list = [], [], []
n_sents = [] # the number of sentences in each generated response
for i in range(n_data):
source = data[i]["source"]
context = data[i]["context"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
context_list.append(context)
output_list.append(system_outputs[j])
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
sent_score = self.scorer.score(input_list, self.task, category, dim)
# Get the summation score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
start_idx += cur_n_sent
# Calculate turn-level score for other dimensions
elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
src_list, output_list, context_list = [], [], []
for i in range(n_data):
src_list.append(data[i]["source"])
output_list.append(data[i]["system_output"])
context_list.append(data[i]["context"])
input_list = add_question(
dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
)
score = self.scorer.score(input_list, self.task, category, dim)
# Please customize other dimensions here for dialogues
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. \
Please customize it first."
)
for i in range(n_data):
eval_scores[i][dim] = score[i]
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class D2tEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for data-to-text"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "data2text"
self.dimensions = ["naturalness", "informativeness"]
def evaluate(self, data, category, dims=None, overall=True):
"""
Get the scores of all the given dimensions
category: The category to be evaluated.
dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
two dimensions: naturalness and informativeness.
overall: indicates whether the overall score is to be calculated.
Overall score can be customized to a combination of scores based on different
dimensions. The default here is the average score of all the given dimensions.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
if dims == None:
eval_dims = self.dimensions
else:
assert isinstance(dims, list)
eval_dims = dims
for dim in eval_dims:
output_list, ref_list = [], []
for i in range(n_data):
output_list.append(data[i]["system_output"])
ref_list.append(data[i]["reference"])
input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
score = self.scorer.score(input_list, self.task, category, dim)
for i in range(n_data):
eval_scores[i][dim] = score[i]
# Customize your overall score here.
if overall == True:
for i in range(n_data):
eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))
return eval_scores
class FactEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up evaluator for factual consistency detection"""
self.scorer = UniEvaluator(
model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
max_length=max_length,
device=device,
cache_dir=cache_dir,
)
self.task = "fact"
self.dim = "consistency"
def evaluate(self, data, category):
"""
Get the factual consistency score (only 1 dimension for this task)
category: The category to be evaluated.
"""
n_data = len(data)
eval_scores = [{} for _ in range(n_data)]
# Calculate average sentence-level scores for factual consistency
src_list, output_list = [], []
n_sents = [] # the number of sentences in the claim
for i in range(n_data):
source = data[i]["source"]
system_outputs = sent_tokenize(data[i]["system_output"])
n_sents.append(len(system_outputs))
for j in range(len(system_outputs)):
src_list.append(source)
output_list.append(system_outputs[j])
input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
sent_score = self.scorer.score(input_list, self.task, category, self.dim)
# Get average score for each sample
start_idx = 0
score = []
for cur_n_sent in n_sents:
score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / cur_n_sent)
start_idx += cur_n_sent
for i in range(n_data):
eval_scores[i][self.dim] = score[i]
return eval_scores
def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
assert task in ["summarization", "dialogue", "data2text", "fact"]
if task == "summarization":
return SumEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "dialogue":
return DialogEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "data2text":
return D2tEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
elif task == "fact":
return FactEvaluator(
model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
)
else:
raise NotImplementedError(
"Other tasks are not implemented, \
please customize specific tasks here."
)
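# Illustrative usage sketch (assumes the default UniEval checkpoint can be fetched
# from the Hugging Face Hub, NLTK's punkt data is installed, and a CUDA device is
# available; the data below is a placeholder):
if __name__ == "__main__":
    evaluator = get_evaluator(task="summarization", device="cuda:0")
    example_data = [
        {
            "source": "A long source document ...",
            "system_output": "A model-generated summary ...",
            "reference": "A human-written reference summary ...",
            "context": "",
        }
    ]
    # Each returned entry holds one score per dimension plus an averaged "overall" score.
    print(evaluator.evaluate(example_data, category="summarization"))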
# MIT License
# Copyright (c) 2022 Ming Zhong
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
class UniEvaluator:
def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
"""Set up model"""
self.device = device
self.max_length = max_length
self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir)
self.model.eval()
self.model.to(device)
self.softmax = nn.Softmax(dim=1)
self.pos_id = self.tokenizer("Yes")["input_ids"][0]
self.neg_id = self.tokenizer("No")["input_ids"][0]
def score(self, inputs, task, category, dim, batch_size=8):
"""
Get scores for the given samples.
final_score = positive_score / (positive_score + negative_score)
"""
# The implementation of "forward" in T5 still requires decoder_input_ids.
# Therefore, we construct a random one-word target sequence.
# The content of the target has no effect on the final scores.
tgts = ["No" for _ in range(len(inputs))]
pos_score_list, neg_score_list = [], []
for i in tqdm(range(0, len(inputs), batch_size), desc=f"{category}-({dim}-{task}): "):
src_list = inputs[i : i + batch_size]
tgt_list = tgts[i : i + batch_size]
try:
with torch.no_grad():
encoded_src = self.tokenizer(
src_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
)
encoded_tgt = self.tokenizer(
tgt_list, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt"
)
src_tokens = encoded_src["input_ids"].to(self.device)
src_mask = encoded_src["attention_mask"].to(self.device)
tgt_tokens = encoded_tgt["input_ids"].to(self.device)[:, 0].unsqueeze(-1)
output = self.model(input_ids=src_tokens, attention_mask=src_mask, labels=tgt_tokens)
logits = output.logits.view(-1, self.model.config.vocab_size)
pos_score = self.softmax(logits)[:, self.pos_id] # Yes
neg_score = self.softmax(logits)[:, self.neg_id] # No
cur_pos_score = [x.item() for x in pos_score]
cur_neg_score = [x.item() for x in neg_score]
pos_score_list += cur_pos_score
neg_score_list += cur_neg_score
except RuntimeError:
print(f"source: {src_list}")
print(f"target: {tgt_list}")
exit(0)
score_list = []
for i in range(len(pos_score_list)):
score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i]))
return score_list
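# Illustrative usage sketch: the scorer maps each Boolean-QA style prompt (built by
# add_question in utils.py) to P("Yes") / (P("Yes") + P("No")). The checkpoint below is
# the default summarization model and the prompt is a placeholder.
if __name__ == "__main__":
    scorer = UniEvaluator(model_name_or_path="MingZhong/unieval-sum", device="cuda:0")
    prompt = "question: Is this a fluent paragraph? </s> paragraph: The cat sat on the mat."
    print(scorer.score([prompt], task="summarization", category="generation", dim="fluency"))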
# MIT License
# Copyright (c) 2022 Ming Zhong
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os
from typing import Dict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm
def add_question(dimension, output, src=None, ref=None, context=None, task=None):
"""
Add questions to generate input in Bool-QA format for UniEval.
dimension: specific dimension to be evaluated
src: source input for different NLG tasks. For example, source document for summarization
and dialogue history for dialogue response generation.
output: output text generated by the models
ref: human-annotated ground truth
context: the context needed to evaluate certain specific dimensions. For example,
additional factual information when evaluating engagingness and groundedness in dialogues.
"""
input_with_question = []
for i in range(len(output)):
# For summarization
if task == "summarization":
if dimension == "fluency":
cur_input = "question: Is this a fluent paragraph? </s> paragraph: " + output[i]
elif dimension == "coherence":
cur_input = (
"question: Is this a coherent summary to the document? </s> summary: "
+ output[i]
+ " </s> document: "
+ src[i]
)
elif dimension == "consistency":
cur_input = (
"question: Is this claim consistent with the document? </s> claim: "
+ output[i]
+ " </s> document: "
+ src[i]
)
elif dimension == "relevance":
cur_input = (
"question: Is this summary relevant to the reference? </s> summary: "
+ output[i]
+ " </s> reference: "
+ ref[i]
)
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. Please customize it first."
)
# For dialogues
elif task == "dialogue":
if dimension == "naturalness":
cur_input = "question: Is this a natural response in the dialogue? </s> response: " + output[i]
elif dimension == "coherence":
cur_input = (
"question: Is this a coherent response given the dialogue history? </s> response: "
+ output[i]
+ " </s> dialogue history: "
+ src[i]
)
elif dimension == "engagingness":
cur_input = (
"question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: "
+ output[i]
+ " </s> dialogue history: "
+ src[i]
+ " </s> fact: "
+ context[i]
)
elif dimension == "groundedness":
cur_input = (
"question: Is this response consistent with knowledge in the fact? </s> response: "
+ output[i]
+ " </s> fact: "
+ context[i]
)
elif dimension == "understandability":
cur_input = "question: Is this an understandable response in the dialogue? </s> response: " + output[i]
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. Please customize it first."
)
# For data-to-text
elif task == "data2text":
if dimension == "naturalness":
cur_input = "question: Is this a fluent utterance? </s> utterance: " + output[i]
elif dimension == "informativeness":
cur_input = (
"question: Is this sentence informative according to the reference? </s> sentence: "
+ output[i]
+ " </s> reference: "
+ ref[i]
)
else:
raise NotImplementedError(
"The input format for this dimension is still undefined. Please customize it first."
)
# For factual consistency detection
elif task == "fact":
if dimension == "consistency":
cur_input = (
"question: Is this claim consistent with the document? </s> claim: "
+ output[i]
+ " </s> document: "
+ src[i]
)
else:
raise NotImplementedError("No other dimensions for the factual consistency detection task.")
# For new customized tasks
else:
raise NotImplementedError("Other tasks are not implemented, please customize specific tasks here.")
input_with_question.append(cur_input)
return input_with_question
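# Illustrative sketch: for the summarization "fluency" dimension, add_question builds one
# Boolean-QA prompt per generated output, e.g.
#   add_question(dimension="fluency", output=["A short generated summary."], task="summarization")
#   -> ["question: Is this a fluent paragraph? </s> paragraph: A short generated summary."]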
def convert_data_to_unieval_format(output_list, src_list=None, ref_list=None):
"""
Convert the data into UniEval's format.
output_list: a list of model outputs
src_list: source input for different NLG tasks. For example, the source document for summarization
and the dialogue history for dialogue response generation
ref_list: human-annotated ground truth
"""
json_data = []
for i in range(len(output_list)):
cur = {}
cur["system_output"] = output_list[i]
if src_list is not None:
cur["source"] = src_list[i]
if ref_list is not None:
cur["reference"] = ref_list[i]
cur["context"] = ""
json_data.append(cur)
return json_data
def calculate_average_score(scores):
"""
Calculate average scores for different metrics
scores: a list of scores for different metrics for each answer
"""
metrics = {metric: 0 for metric in scores[0]}
for score in scores:
for metric in score:
metrics[metric] += score[metric]
for metric in metrics:
metrics[metric] /= len(scores)
return metrics
def save_unieval_results(model_name: str, unieval_metric_stats: Dict[str, Dict], save_path: str) -> None:
"""
Save UniEval evaluation results of different categories for one model.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
unieval_metric_stats_per_category = {}
for task, category_stat in unieval_metric_stats.items():
for category, metric_stat in category_stat.items():
if unieval_metric_stats_per_category.get(category, None) is None:
unieval_metric_stats_per_category[category] = {}
for metric, score in metric_stat.items():
unieval_metric_stats_per_category[category][f"{metric}-{task}"] = score
automatic_df = pd.DataFrame(unieval_metric_stats_per_category)
automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
def read_unieval_results(results_path: str, file_name: str) -> Dict[str, Dict]:
"""
Read a csv file and return a dictionary which stores scores per metric.
"""
results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
results_dict = {metric: {} for metric in list(results.index)}
for i, metric in enumerate(results_dict.keys()):
for j, category in enumerate(list(results.columns)):
if pd.isnull(results.iloc[i][j]):
continue
results_dict[metric][category] = results.iloc[i][j]
return results_dict
def analyze_unieval_results(results_path: str, save_path: str) -> None:
"""
Analyze and visualize all csv files in the given folder.
"""
if not os.path.exists(results_path):
raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
all_statistics = {}
for file_name in os.listdir(results_path):
if file_name.endswith("_results.csv"):
model_name = file_name.split("_results.csv")[0]
all_statistics[model_name] = read_unieval_results(results_path, file_name)
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no csv files in the given directory "{results_path}"!')
frame_all = {"model": [], "category": [], "metric": [], "score": []}
frame_per_metric = {}
for model_name, model_statistics in all_statistics.items():
for metric, metric_statistics in model_statistics.items():
if frame_per_metric.get(metric) is None:
frame_per_metric[metric] = {"model": [], "category": [], "score": []}
for category, category_score in metric_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["score"].append(category_score)
frame_per_metric[metric]["model"].append(model_name)
frame_per_metric[metric]["category"].append(category)
frame_per_metric[metric]["score"].append(category_score)
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "unieval_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"UniEval metrics: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
sns.set()
fig = plt.figure(figsize=(16, 10))
fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
fig.set_title(
f"Comparison between Different Models for Metric {metric.split('-')[0].title()} in Task {metric.split('-')[1].title()}"
)
plt.xlabel("Evaluation Category")
plt.ylabel("Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
plt.close()
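# Illustrative sketch (no model required; the model name and paths are placeholders):
# per-sample scores returned by an evaluator can be averaged, saved per task and
# category, and then visualized.
if __name__ == "__main__":
    per_sample_scores = [
        {"naturalness": 0.8, "informativeness": 0.7, "overall": 0.75},
        {"naturalness": 0.6, "informativeness": 0.9, "overall": 0.75},
    ]
    averaged = calculate_average_score(per_sample_scores)
    save_unieval_results("example-model", {"data2text": {"generation": averaged}}, "./unieval_results")
    analyze_unieval_results("./unieval_results", "./unieval_figures")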
import io
import json
import os
import string
from typing import Dict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm
from zhon import hanzi
def _make_w_io_base(f, mode: str):
if not isinstance(f, io.IOBase):
f_dirname = os.path.dirname(f)
if f_dirname != "":
os.makedirs(f_dirname, exist_ok=True)
f = open(f, mode=mode)
return f
def _make_r_io_base(f, mode: str):
if not isinstance(f, io.IOBase):
f = open(f, mode=mode)
return f
def jdump(obj, f, mode="w", indent=4, default=str):
"""Dump a str or dictionary to a file in json format.
Args:
obj: An object to be written.
f: A string path to the location on disk.
mode: Mode for opening the file.
indent: Indent for storing json dictionaries.
default: A function to handle non-serializable entries; defaults to `str`.
"""
f = _make_w_io_base(f, mode)
if isinstance(obj, (dict, list)):
json.dump(obj, f, indent=indent, default=default, ensure_ascii=False)
elif isinstance(obj, str):
f.write(obj)
else:
raise ValueError(f"Unexpected type: {type(obj)}")
f.close()
def jload(f, mode="r"):
"""Load a .json file into a dictionary."""
f = _make_r_io_base(f, mode)
jdict = json.load(f)
f.close()
return jdict
def get_json_list(file_path):
with open(file_path, "r") as f:
json_list = []
for line in f:
json_list.append(json.loads(line))
return json_list
def get_data_per_category(data, categories):
data_per_category = {category: [] for category in categories}
for item in data:
category = item["category"]
if category in categories:
data_per_category[category].append(item)
return data_per_category
def remove_punctuations(text: str) -> str:
"""
Remove punctuations in the given text.
It is used in evaluation of automatic metrics.
"""
punctuation = string.punctuation + hanzi.punctuation
punctuation = set([char for char in punctuation])
punctuation.difference_update(set("!@#$%&()<>?|,.\"'"))
out = []
for char in text:
if char in punctuation:
continue
else:
out.append(char)
return "".join(out)
def remove_redundant_space(text: str) -> str:
"""
Remove redundant spaces in the given text.
It is used in evaluation of automatic metrics.
"""
return " ".join(text.split())
def preprocessing_text(text: str) -> str:
"""
Preprocess the given text.
It is used in evaluation of automatic metrics.
"""
return remove_redundant_space(remove_punctuations(text.lower()))
def save_automatic_results(model_name: str, automatic_metric_stats: Dict[str, Dict], save_path: str) -> None:
"""
Save automatic evaluation results of different categories for one model.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
automatic_df = pd.DataFrame(automatic_metric_stats)
automatic_df.to_csv(os.path.join(save_path, f"{model_name}_results.csv"), index=True)
def read_automatic_results(results_path: str, file_name: str) -> Dict[str, Dict]:
"""
Read a csv file and return a dictionary which stores scores per metric.
"""
results = pd.read_csv(os.path.join(results_path, file_name), index_col=0)
results_dict = {metric: {} for metric in list(results.index)}
for i, metric in enumerate(results_dict.keys()):
for j, category in enumerate(list(results.columns)):
if pd.isnull(results.iloc[i][j]):
continue
results_dict[metric][category] = results.iloc[i][j]
return results_dict
def analyze_automatic_results(results_path: str, save_path: str) -> None:
"""
Analyze and visualize all csv files in the given folder.
"""
if not os.path.exists(results_path):
raise Exception(f'The given directory "{results_path}" doesn\'t exist! No results found!')
all_statistics = {}
for file_name in os.listdir(results_path):
if file_name.endswith("_results.csv"):
model_name = file_name.split("_results.csv")[0]
all_statistics[model_name] = read_automatic_results(results_path, file_name)
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no csv files in the given directory "{results_path}"!')
frame_all = {"model": [], "category": [], "metric": [], "score": []}
frame_per_metric = {}
for model_name, model_statistics in all_statistics.items():
for metric, metric_statistics in model_statistics.items():
if frame_per_metric.get(metric) is None:
frame_per_metric[metric] = {"model": [], "category": [], "score": []}
for category, category_score in metric_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["score"].append(category_score)
frame_per_metric[metric]["model"].append(model_name)
frame_per_metric[metric]["category"].append(category)
frame_per_metric[metric]["score"].append(category_score)
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "automatic_evaluation_statistics.csv"))
for metric in tqdm.tqdm(
frame_per_metric.keys(),
desc=f"automatic metrics: ",
total=len(frame_per_metric.keys()),
):
data = pd.DataFrame(frame_per_metric[metric])
sns.set()
fig = plt.figure(figsize=(16, 10))
fig = sns.barplot(x="category", y="score", hue="model", data=data, dodge=True)
fig.set_title(f"Comparison between Different Models for Metric {metric.title()}")
plt.xlabel("Evaluation Category")
plt.ylabel("Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{metric}.png"), dpi=400)
plt.close()
from .agieval import AGIEvalDataset
from .base import BaseDataset
from .ceval import CEvalDataset
from .cmmlu import CMMLUDataset
from .colossalai import ColossalDataset
from .gaokaobench import GaoKaoBenchDataset
from .longbench import LongBenchDataset
from .mmlu import MMLUDataset
__all__ = [
"AGIEvalDataset",
"BaseDataset",
"CEvalDataset",
"CMMLUDataset",
"GaoKaoBenchDataset",
"LongBenchDataset",
"MMLUDataset",
"ColossalDataset",
]
# Adapted from https://github.com/ruixiangcui/AGIEval/blob/main/src/dataset_loader.py.
import ast
import glob
import os
from copy import deepcopy
from typing import Dict, List
import pandas as pd
from colossal_eval.utils import get_json_list
from colossalai.logging import DistributedLogger
from .base import BaseDataset
# define the datasets
english_qa_datasets = [
"lsat-ar",
"lsat-lr",
"lsat-rc",
"logiqa-en",
"sat-math",
"sat-en",
"aqua-rat",
"sat-en-without-passage",
"gaokao-english",
]
chinese_qa_datasets = [
"logiqa-zh",
"jec-qa-kd",
"jec-qa-ca",
"gaokao-chinese",
"gaokao-geography",
"gaokao-history",
"gaokao-biology",
"gaokao-chemistry",
"gaokao-physics",
"gaokao-mathqa",
]
english_cloze_datasets = ["math"]
chinese_cloze_datasets = ["gaokao-mathcloze"]
multi_choice_datasets = ["jec-qa-kd", "jec-qa-ca", "gaokao-physics", "gaokao-mathqa"]
math_output_datasets = {"gaokao-mathcloze", "math"}
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_prompt(line: Dict, dataset_name: str, logger: DistributedLogger) -> Dict:
"""Modified from https://github.com/microsoft/AGIEval/blob/main/src/dataset_loader.py#L190"""
try:
all_classes = None
passage = line["passage"] if line["passage"] is not None else ""
if dataset_name in english_qa_datasets:
option_string = "ABCDEFG"
count = len(line["options"])
input = (
"Question: "
+ line["question"]
+ " "
+ "Choose from the following options: "
+ " ".join(line["options"])
+ "\n"
+ "Answer: "
)
all_classes = list(option_string[0:count])
elif dataset_name in chinese_qa_datasets:
option_string = "ABCDEFG"
count = len(line["options"])
input = "问题:" + line["question"] + " " + "从以下选项中选择:" + " ".join(line["options"]) + "\n" + "答案:"
all_classes = list(option_string[0:count])
elif dataset_name in english_cloze_datasets:
input = "Question: " + line["question"] + "\n" + "Answer: "
elif dataset_name in chinese_cloze_datasets:
input = "问题:" + line["question"] + "\n" + "答案:"
return {
"instruction": input if not passage else passage + "\n\n" + input,
"target": line["label"] if line["label"] else line["answer"],
}, all_classes
except NameError:
logger.info("Dataset not defined.")
# process few-shot raw_prompts
def combine_prompt(prompt_path, dataset_name, load_explanation=True, chat_mode=False):
skip_passage = False
if dataset_name == "sat-en-without-passage":
skip_passage = True
dataset_name = "sat-en"
demonstrations = []
# read the prompts by context and explanation
context_row = [0, 1, 3, 5, 7, 9]
explanation_row = [0, 2, 4, 6, 8, 10]
raw_prompts_context = pd.read_csv(
prompt_path, header=0, skiprows=lambda x: x not in context_row, keep_default_na=False
)
raw_prompts_explanation = pd.read_csv(
prompt_path, header=0, skiprows=lambda x: x not in explanation_row, keep_default_na=False
).replace(r"\n\n", "\n", regex=True)
contexts = []
for line in list(raw_prompts_context[dataset_name]):
if line:
# print(line)
contexts.append(ast.literal_eval(line))
explanations = [exp for exp in raw_prompts_explanation[dataset_name] if exp]
for idx, (con, exp) in enumerate(zip(contexts, explanations)):
passage = con["passage"] if con["passage"] is not None and not skip_passage else ""
question = con["question"]
options = con["options"] if con["options"] is not None else ""
label = con["label"] if con["label"] is not None else ""
answer = con["answer"] if "answer" in con and con["answer"] is not None else ""
if dataset_name in english_qa_datasets:
question_input = (
"Question: "
+ passage
+ " "
+ question
+ "\n"
+ "Choose from the following options: "
+ " ".join(options)
+ "\n"
+ "Answer: {}".format(label)
)
elif dataset_name in chinese_qa_datasets:
question_input = (
"问题:" + passage + " " + question + "\n" + "从以下选项中选择:" + " ".join(options) + "\n" + "答案:{}".format(label)
)
elif dataset_name in english_cloze_datasets:
question_input = "Question: ".format(idx + 1) + question + "\n" + "Answer: {}".format(answer)
elif dataset_name in chinese_cloze_datasets:
question_input = "问题:" + question + "\n" + "答案:{}".format(answer)
else:
raise ValueError(f"During loading few-sot examples, found unknown dataset: {dataset_name}")
if chat_mode:
demonstrations.append((question_input,))
else:
demonstrations.append(question_input + "\n")
return demonstrations
class AGIEvalDataset(BaseDataset):
"""
Dataset wrapper for AGIEval dataset.
Data source: https://github.com/microsoft/AGIEval
This dataset class will convert the original dataset into the inference dataset.
A few dirty data samples in the original dataset need to be manually corrected:
Issue link: https://github.com/microsoft/AGIEval/issues/16
1. Invalid options in line 190 in gaokao-chemistry.jsonl.
2. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en-without-passage.jsonl.
3. Option D (They may increase in value as those same resources become rare on Earth.) missing in line 17 in sat-en.jsonl.
4. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en-without-passage.jsonl.
5. Option D (No, because the data do not indicate whether the honeybees had been infected with mites.) missing in line 57 in sat-en.jsonl.
6. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en-without-passage.jsonl.
7. Option D (Published theories of scientists who developed earlier models of the Venus flytrap) missing in line 98 in sat-en.jsonl.
8. Label is empty in line 212 in jec-qa-kd.jsonl. Content is also dirty.
9. gaokao-mathqa.jsonl is actually also a multi-choice dataset. See lines 149, 286 and 287.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": {}}
files = glob.glob(os.path.join(path, "*.jsonl"))
files.sort()
if few_shot:
prompt_path = os.path.join(path, "few_shot_prompts.csv")
for file in files:
dataset_name = os.path.basename(file)[0 : -len(".jsonl")]
few_shot_data = []
if few_shot:
# process demo once if it is few-shot-CoT
few_shot_data = combine_prompt(prompt_path, dataset_name, load_explanation=False, chat_mode=False)
dataset["test"][dataset_name] = {"data": []}
file_dir = os.path.join(path, file)
loaded_jsonl = get_json_list(file_dir)
# It has been verified that each data sample in one subcategory has the same inference arguments.
_, all_classes = get_prompt(loaded_jsonl[0], dataset_name, logger)
inference_kwargs = deepcopy(default_inference_kwargs)
if all_classes is not None and dataset_name not in multi_choice_datasets:
inference_kwargs["all_classes"] = all_classes
if dataset_name in english_qa_datasets:
inference_kwargs["language"] = "English"
if dataset_name in chinese_qa_datasets:
inference_kwargs["language"] = "Chinese"
inference_kwargs["few_shot_data"] = few_shot_data
dataset["test"][dataset_name]["inference_kwargs"] = inference_kwargs
for line in loaded_jsonl:
info, all_classes = get_prompt(line, dataset_name, logger)
# Convert multi-choice answers to a single string.
# We will convert it back when evaluating.
# We do this because a list-type target should only be used for questions with multiple target answers.
if dataset_name in multi_choice_datasets:
if isinstance(info["target"], str) and len(info["target"]) > 1:
# "gaokao-mathqa" actually contain multi-choice questions.
# This if clause is specially used for it.
info["target"] = "".join(info["target"].split())
else:
info["target"] = "".join(info["target"])
if isinstance(info["target"], list) and len(info["target"]) == 1:
info["target"] = info["target"][0]
data_sample = {
"dataset": "agieval",
"split": "test",
"category": dataset_name,
"instruction": info["instruction"],
"input": "",
"output": "",
"target": info["target"],
}
dataset["test"][dataset_name]["data"].append(data_sample)
return dataset
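# Illustrative sketch (the data path is a placeholder; get_dist_logger is assumed to be
# usable outside a launched distributed run):
#   from colossalai.logging import get_dist_logger
#   dataset = AGIEvalDataset(path="path/to/AGIEval/data/v1", logger=get_dist_logger(), few_shot=False)
#   dataset.save("agieval_inference_data.json")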
from abc import abstractstaticmethod
from colossal_eval.utils import jdump
class BaseDataset:
"""
Base class for dataset wrapper.
Args:
path: The path to the original dataset.
logger: Logger for the dataset.
"""
def __init__(self, path, logger, few_shot):
self.dataset = self.load(path, logger, few_shot)
def save(self, save_path):
"""Save the converted dataset"""
jdump(self.dataset, save_path)
@abstractstaticmethod
def load(path, logger, few_shot):
"""Load the original dataset and convert it into the inference dataset"""
import copy
import csv
import os
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
ceval_subject_mapping = {
"computer_network": ["Computer Network", "计算机网络", "STEM"],
"operating_system": ["Operating System", "操作系统", "STEM"],
"computer_architecture": ["Computer Architecture", "计算机组成", "STEM"],
"college_programming": ["College Programming", "大学编程", "STEM"],
"college_physics": ["College Physics", "大学物理", "STEM"],
"college_chemistry": ["College Chemistry", "大学化学", "STEM"],
"advanced_mathematics": ["Advanced Mathematics", "高等数学", "STEM"],
"probability_and_statistics": ["Probability and Statistics", "概率统计", "STEM"],
"discrete_mathematics": ["Discrete Mathematics", "离散数学", "STEM"],
"electrical_engineer": ["Electrical Engineer", "注册电气工程师", "STEM"],
"metrology_engineer": ["Metrology Engineer", "注册计量师", "STEM"],
"high_school_mathematics": ["High School Mathematics", "高中数学", "STEM"],
"high_school_physics": ["High School Physics", "高中物理", "STEM"],
"high_school_chemistry": ["High School Chemistry", "高中化学", "STEM"],
"high_school_biology": ["High School Biology", "高中生物", "STEM"],
"middle_school_mathematics": ["Middle School Mathematics", "初中数学", "STEM"],
"middle_school_biology": ["Middle School Biology", "初中生物", "STEM"],
"middle_school_physics": ["Middle School Physics", "初中物理", "STEM"],
"middle_school_chemistry": ["Middle School Chemistry", "初中化学", "STEM"],
"veterinary_medicine": ["Veterinary Medicine", "兽医学", "STEM"],
"college_economics": ["College Economics", "大学经济学", "Social Science"],
"business_administration": ["Business Administration", "工商管理", "Social Science"],
"marxism": ["Marxism", "马克思主义基本原理", "Social Science"],
"mao_zedong_thought": ["Mao Zedong Thought", "毛泽东思想和中国特色社会主义理论体系概论", "Social Science"],
"education_science": ["Education Science", "教育学", "Social Science"],
"teacher_qualification": ["Teacher Qualification", "教师资格", "Social Science"],
"high_school_politics": ["High School Politics", "高中政治", "Social Science"],
"high_school_geography": ["High School Geography", "高中地理", "Social Science"],
"middle_school_politics": ["Middle School Politics", "初中政治", "Social Science"],
"middle_school_geography": ["Middle School Geography", "初中地理", "Social Science"],
"modern_chinese_history": ["Modern Chinese History", "近代史纲要", "Humanities"],
"ideological_and_moral_cultivation": ["Ideological and Moral Cultivation", "思想道德修养与法律基础", "Humanities"],
"logic": ["Logic", "逻辑学", "Humanities"],
"law": ["Law", "法学", "Humanities"],
"chinese_language_and_literature": ["Chinese Language and Literature", "中国语言文学", "Humanities"],
"art_studies": ["Art Studies", "艺术学", "Humanities"],
"professional_tour_guide": ["Professional Tour Guide", "导游资格", "Humanities"],
"legal_professional": ["Legal Professional", "法律职业资格", "Humanities"],
"high_school_chinese": ["High School Chinese", "高中语文", "Humanities"],
"high_school_history": ["High School History", "高中历史", "Humanities"],
"middle_school_history": ["Middle School History", "初中历史", "Humanities"],
"civil_servant": ["Civil Servant", "公务员", "Other"],
"sports_science": ["Sports Science", "体育学", "Other"],
"plant_protection": ["Plant Protection", "植物保护", "Other"],
"basic_medicine": ["Basic Medicine", "基础医学", "Other"],
"clinical_medicine": ["Clinical Medicine", "临床医学", "Other"],
"urban_and_rural_planner": ["Urban and Rural Planner", "注册城乡规划师", "Other"],
"accountant": ["Accountant", "注册会计师", "Other"],
"fire_engineer": ["Fire Engineer", "注册消防工程师", "Other"],
"environmental_impact_assessment_engineer": ["Environmental Impact Assessment Engineer", "环境影响评价工程师", "Other"],
"tax_accountant": ["Tax Accountant", "税务师", "Other"],
"physician": ["Physician", "医师资格", "Other"],
}
default_inference_kwargs = {
"calculate_loss": False,
"all_classes": ["A", "B", "C", "D"],
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_few_shot_data(data: List[Dict]):
few_shot_data = []
for i in data:
few_shot_data.append(i["input"] + i["target"])
return few_shot_data
class CEvalDataset(BaseDataset):
"""
Dataset class for CEval dataset.
Data source: https://huggingface.co/datasets/ceval/ceval-exam
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"dev": {}, "test": {}}
for split in ["dev", "test"]:
files = os.listdir(os.path.join(path, split))
files.sort()
for file in files:
subject = file[0 : -len(f"_{split}.csv")]
subject = ceval_subject_mapping[subject][1]
file_dir = os.path.join(path, split, file)
dataset[split][subject] = {"data": []}
# It has been verified that all data samples within one subcategory share the same inference arguments.
dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
if split == "test" and few_shot:
dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
dataset["dev"][subject]["data"]
)
with open(file_dir, encoding="utf-8") as f:
reader = csv.reader(f)
_ = next(reader)
for row in reader:
# The dev split has an answer and an explanation, so len(row) is 8,
# but the test split doesn't contain them, so len(row) is 6.
assert len(row) >= 6
choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
data_sample = {
"dataset": "ceval",
"split": split,
"category": subject,
"instruction": f"以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。",
"input": f"题目:{row[1]}\n{choices}\n答案:",
"output": "",
"target": row[6] if split == "dev" else "",
"id": int(row[0]),
}
dataset[split][subject]["data"].append(data_sample)
return dataset
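# Minimal usage sketch (the path below is hypothetical; it should point to the unpacked
# ceval-exam folder containing the `dev/` and `test/` subdirectories):
#
#   from colossalai.logging import get_dist_logger
#   dataset = CEvalDataset.load("path/to/ceval-exam", get_dist_logger(), few_shot=True)
#   # e.g. inspect one subject's few-shot examples built from the dev split
#   print(dataset["test"]["高中物理"]["inference_kwargs"]["few_shot_data"][0])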
import copy
import csv
import os
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
cmmlu_subject_mapping = {
"agronomy": "农学",
"anatomy": "解剖学",
"ancient_chinese": "古汉语",
"arts": "艺术学",
"astronomy": "天文学",
"business_ethics": "商业伦理",
"chinese_civil_service_exam": "中国公务员考试",
"chinese_driving_rule": "中国驾驶规则",
"chinese_food_culture": "中国饮食文化",
"chinese_foreign_policy": "中国外交政策",
"chinese_history": "中国历史",
"chinese_literature": "中国文学",
"chinese_teacher_qualification": "中国教师资格",
"clinical_knowledge": "临床知识",
"college_actuarial_science": "大学精算学",
"college_education": "大学教育学",
"college_engineering_hydrology": "大学工程水文学",
"college_law": "大学法律",
"college_mathematics": "大学数学",
"college_medical_statistics": "大学医学统计",
"college_medicine": "大学医学",
"computer_science": "计算机科学",
"computer_security": "计算机安全",
"conceptual_physics": "概念物理学",
"construction_project_management": "建设工程管理",
"economics": "经济学",
"education": "教育学",
"electrical_engineering": "电气工程",
"elementary_chinese": "小学语文",
"elementary_commonsense": "小学常识",
"elementary_information_and_technology": "小学信息技术",
"elementary_mathematics": "初等数学",
"ethnology": "民族学",
"food_science": "食品科学",
"genetics": "遗传学",
"global_facts": "全球事实",
"high_school_biology": "高中生物",
"high_school_chemistry": "高中化学",
"high_school_geography": "高中地理",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理学",
"high_school_politics": "高中政治",
"human_sexuality": "人类性行为",
"international_law": "国际法学",
"journalism": "新闻学",
"jurisprudence": "法理学",
"legal_and_moral_basis": "法律与道德基础",
"logical": "逻辑学",
"machine_learning": "机器学习",
"management": "管理学",
"marketing": "市场营销",
"marxist_theory": "马克思主义理论",
"modern_chinese": "现代汉语",
"nutrition": "营养学",
"philosophy": "哲学",
"professional_accounting": "专业会计",
"professional_law": "专业法学",
"professional_medicine": "专业医学",
"professional_psychology": "专业心理学",
"public_relations": "公共关系",
"security_study": "安全研究",
"sociology": "社会学",
"sports_science": "体育学",
"traditional_chinese_medicine": "中医中药",
"virology": "病毒学",
"world_history": "世界历史",
"world_religions": "世界宗教",
}
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": ["A", "B", "C", "D"],
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_few_shot_data(data: List[Dict]):
few_shot_data = []
for i in data:
few_shot_data.append(i["input"] + i["target"])
return few_shot_data
class CMMLUDataset(BaseDataset):
"""
Dataset class for CMMLU dataset.
Data source: https://github.com/haonan-li/CMMLU/tree/master/data
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"dev": {}, "test": {}}
for split in ["dev", "test"]:
files = os.listdir(os.path.join(path, split))
files.sort()
for file in files:
subject = file[0 : -len(".csv")]
subject = cmmlu_subject_mapping[subject]
file_dir = os.path.join(path, split, file)
dataset[split][subject] = {"data": []}
# It has been verified that all data samples within one subcategory share the same inference arguments.
dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
if split == "test" and few_shot:
dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
dataset["dev"][subject]["data"]
)
with open(file_dir, encoding="utf-8") as f:
reader = csv.reader(f)
_ = next(reader)
for row in reader:
assert len(row) == 7
choices = f"A. {row[2]}\nB. {row[3]}\nC. {row[4]}\nD. {row[5]}"
data_sample = {
"dataset": "cmmlu",
"split": split,
"category": subject,
"instruction": f"以下是关于{subject}的单项选择题,请直接给出正确答案的选项。",
"input": f"题目:{row[1]}\n{choices}\n答案:",
"output": "",
"target": row[6],
}
dataset[split][subject]["data"].append(data_sample)
return dataset
from collections import defaultdict
from copy import deepcopy
from typing import Dict, List
from colossal_eval.utils import jload
from colossalai.logging import DistributedLogger
from .base import BaseDataset
default_inference_kwargs = {
"calculate_loss": False,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 256,
}
# You can add your own subcategories here and specify whether each one is a single-choice question or has target answers and needs loss calculation.
single_choice_question = set()
calculate_loss = set()
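# Example (hypothetical category names; use the `category` values from your own JSON file):
#   single_choice_question = {"logic_mcq"}   # evaluated against all_classes ["A", "B", "C", "D"]
#   calculate_loss = {"closed_qa"}           # loss is computed against the target answer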
def get_data_per_category(data):
data_per_category = defaultdict(list)
for item in data:
category = item["category"]
data_per_category[category].append(item)
return data_per_category
class ColossalDataset(BaseDataset):
"""
Dataset class for Colossal dataset.
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": {}}
data = jload(path)
data_per_category = get_data_per_category(data)
categories = list(data_per_category.keys())
for category in categories:
dataset["test"][category] = {"data": []}
category_data = data_per_category[category]
dataset["test"][category]["inference_kwargs"] = deepcopy(default_inference_kwargs)
if category in calculate_loss:
dataset["test"][category]["inference_kwargs"]["calculate_loss"] = True
if category in single_choice_question:
dataset["test"][category]["inference_kwargs"]["all_classes"] = ["A", "B", "C", "D"]
for item in category_data:
data_sample = {
"dataset": "colossal",
"split": "test",
"category": category,
"instruction": item["instruction"],
"input": item["input"],
"output": "",
"target": item["target"],
"id": item["id"],
}
dataset["test"][category]["data"].append(data_sample)
return dataset
import json
import os
import re
from copy import deepcopy
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
multi_choice_datasets = [
"Chinese Lang and Usage MCQs",
"Chinese Modern Lit",
"English Fill in Blanks",
"English Reading Comp",
"Geography MCQs",
"Physics MCQs",
"English Cloze Test",
]
chinese_qa_datasets = [
"Biology MCQs",
"Chemistry MCQs",
"Chinese Lang and Usage MCQs",
"Chinese Modern Lit",
"Geography MCQs",
"History MCQs",
"Math I MCQs",
"Math II MCQs",
"Physics MCQs",
"Political Science MCQs",
]
english_qa_datasets = ["English MCQs", "English Fill in Blanks", "English Reading Comp", "English Cloze Test"]
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
def get_all_classes(instruction: str):
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
pattern = r"([A-Z]\. |[A-Z].|[A-Z]\.)"
options = sorted(list(set(re.findall(pattern, instruction))))
options = sorted(list(set([string[0] for string in options])))
for i in range(len(options)):
if options[i] == letters[i]:
continue
else:
return options[0:i]
return options
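# Worked example: for an instruction containing "A. 甲\nB. 乙\nC. 丙\nD. 丁" the regex collects
# "A. ", "B. ", "C. ", "D. ", so the function returns ["A", "B", "C", "D"]. Stray capital Latin
# letters elsewhere in the question text (e.g. in "NaOH") are dropped by the early return, which
# keeps only the contiguous prefix "A", "B", "C", ... of option labels.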
class GaoKaoBenchDataset(BaseDataset):
"""
Dataset class for GAOKAO-Bench dataset.
Data source: https://github.com/OpenLMLab/GAOKAO-Bench/tree/main/data
This dataset class will convert the original dataset into the inference dataset.
A few typos in the original dataset had to be corrected manually; the following issues are among those fixed.
Issue link: https://github.com/OpenLMLab/GAOKAO-Bench/issues/20
1. Option C missing in index 111 in 2010-2022_Chemistry_MCQs.json
2. Option B missing "." after it in index 16 in 2012-2022_English_Cloze_Test.json
3. Option G missing "." after it in index 23 in 2012-2022_English_Cloze_Test.json
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"test": {}}
for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
files = os.listdir(os.path.join(path, "data", category))
files.sort()
for file in files:
subject = file[10:-5].split("_")
subject = " ".join(subject)
dataset["test"][subject] = {"data": []}
file_dir = os.path.join(path, "data", category, file)
with open(file_dir, encoding="utf-8") as f:
data = json.load(f)
# It has been verified that all data samples within one subcategory share the same inference arguments.
inference_kwargs = deepcopy(default_inference_kwargs)
if category == "Multiple-choice_Questions" and subject not in multi_choice_datasets:
all_classes = get_all_classes(data["example"][0]["question"])
inference_kwargs["all_classes"] = all_classes
if subject in english_qa_datasets:
inference_kwargs["language"] = "English"
if subject in chinese_qa_datasets:
inference_kwargs["language"] = "Chinese"
dataset["test"][subject]["inference_kwargs"] = inference_kwargs
for sample in data["example"]:
# Convert multi-choice answers to a single string.
# We will convert it back when evaluating.
# We do this because a target that is a list should only be used for multiple reference answers.
if subject in multi_choice_datasets:
sample["answer"] = "".join(sample["answer"])
if isinstance(sample["answer"], list) and len(sample["answer"]) == 1:
sample["answer"] = sample["answer"][0]
data_sample = {
"dataset": "gaokaobench",
"split": "test",
"category": f"{category[:-10]}-{subject}",
"instruction": sample["question"].strip() + "\n答案:",
"input": "",
"output": "",
"target": sample["answer"],
}
dataset["test"][subject]["data"].append(data_sample)
return dataset
import os
from copy import deepcopy
from typing import Dict, List
from colossal_eval.utils import get_json_list
from colossalai.logging import DistributedLogger
from .base import BaseDataset
dataset2prompt = {
"narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"qasper": 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:',
"multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:",
"hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:",
"gov_report": "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
"qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
"multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
"vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:",
"trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
"triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
"samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
"lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}",
"passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
"passage_retrieval_en": 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ',
"passage_retrieval_zh": '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:',
"lcc": "Please complete the code given below. \n{context}Next line of code:\n",
"repobench-p": "Please complete the code given below. \n{context}{input}Next line of code:\n",
}
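# Each template is filled with the raw LongBench sample via str.format in `load`, e.g.
# (hypothetical sample): dataset2prompt["hotpotqa"].format(context="…passages…", input="Who wrote X?")
# Extra keys in the sample (such as "answers") are simply ignored when passed through format(**sample).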
dataset2maxlen = {
"narrativeqa": 128,
"qasper": 128,
"multifieldqa_en": 64,
"multifieldqa_zh": 64,
"hotpotqa": 32,
"2wikimqa": 32,
"musique": 32,
"dureader": 128,
"gov_report": 512,
"qmsum": 512,
"multi_news": 512,
"vcsum": 512,
"trec": 64,
"triviaqa": 32,
"samsum": 128,
"lsht": 64,
"passage_count": 32,
"passage_retrieval_en": 32,
"passage_retrieval_zh": 32,
"lcc": 64,
"repobench-p": 64,
}
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": None,
"language": "Chinese",
"pretrain": False,
"max_new_tokens": 32,
}
class LongBenchDataset(BaseDataset):
"""
Dataset class for LongBench dataset.
Data source: https://huggingface.co/datasets/THUDM/LongBench
This dataset class will convert the original dataset into the inference dataset.
Issue link: https://github.com/THUDM/LongBench/issues/15 (fixed)
There are duplicate target answers in `nq.jsonl`, but this doesn't affect evaluation results.
It also doesn't affect the perplexity calculation (the program only needs to select the minimum loss).
"""
@staticmethod
def load(path: str, logger: DistributedLogger) -> List[Dict]:
dataset = {"test": {}}
files = os.listdir(path)
files.sort()
for file in files:
category = file[0:-6]
if category.endswith("_e"):
continue
dataset["test"][category] = {"data": []}
file_dir = os.path.join(path, file)
loaded_jsonl = get_json_list(file_dir)
# It has been verified that all data samples within one subcategory share the same inference arguments.
inference_kwargs = deepcopy(default_inference_kwargs)
if loaded_jsonl[0]["all_classes"] is not None:
inference_kwargs["all_classes"] = loaded_jsonl[0]["all_classes"]
inference_kwargs["max_new_tokens"] = dataset2maxlen[category]
dataset["test"][category]["inference_kwargs"] = inference_kwargs
for sample in loaded_jsonl:
prompt = dataset2prompt[category].format(**sample)
data_sample = {
"dataset": "longbench",
"split": "test",
"category": category,
"instruction": prompt,
"input": "",
"output": "",
"target": sample["answers"],
}
dataset["test"][category]["data"].append(data_sample)
return dataset
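# Minimal usage sketch (the path is hypothetical; it should contain the LongBench *.jsonl files):
#
#   from colossalai.logging import get_dist_logger
#   dataset = LongBenchDataset.load("path/to/LongBench/data", get_dist_logger())
#   print(dataset["test"]["hotpotqa"]["inference_kwargs"]["max_new_tokens"])  # 32, per dataset2maxlen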
import copy
import csv
import os
from typing import Dict, List
from colossalai.logging import DistributedLogger
from .base import BaseDataset
default_inference_kwargs = {
"calculate_loss": True,
"all_classes": ["A", "B", "C", "D"],
"language": "English",
"pretrain": False,
"max_new_tokens": 32,
}
def get_few_shot_data(data: List[Dict]):
few_shot_data = []
for i in data:
few_shot_data.append(i["input"] + i["target"])
return few_shot_data
class MMLUDataset(BaseDataset):
"""
Dataset class for MMLU dataset.
Data source: https://github.com/hendrycks/test
This dataset class will convert the original dataset into the inference dataset.
"""
@staticmethod
def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
dataset = {"dev": {}, "test": {}}
for split in ["dev", "test"]:
files = os.listdir(os.path.join(path, split))
files.sort()
for file in files:
subject = file[0 : -len(f"_{split}.csv")].split("_")
subject = " ".join([word.title() if word != "us" else "US" for word in subject])
file_dir = os.path.join(path, split, file)
dataset[split][subject] = {"data": [], "inference_kwargs": {}}
# It has been verified that all data samples within one subcategory share the same inference arguments.
dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
if split == "test" and few_shot:
dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data(
dataset["dev"][subject]["data"]
)
with open(file_dir, encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
assert len(row) == 6
choices = f"A. {row[1]}\nB. {row[2]}\nC. {row[3]}\nD. {row[4]}"
data_sample = {
"dataset": "mmlu",
"split": split,
"category": subject,
"instruction": f"The following is a single-choice question on {subject}. Answer the question by replying A, B, C or D.",
"input": f"Question: {row[0]}\n{choices}\nAnswer: ",
"output": "",
"target": row[5],
}
dataset[split][subject]["data"].append(data_sample)
return dataset