Unverified commit 2506e275, authored by Yuanchen, committed by GitHub
Browse files

[evaluation] improvement on evaluation (#3862)



* fix a bug when the config file contains one category but the answer file doesn't contain that category

* fix Chinese prompt file

* support gpt-3.5-turbo and gpt-4 evaluation

* polish and update README

* resolve pr comments

---------
Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
parent b0474878
This diff is collapsed.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
"language": "cn", "language": "cn",
"category": { "category": {
"brainstorming": { "brainstorming": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"creativity", "creativity",
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
] ]
}, },
"chat": { "chat": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"naturalness", "naturalness",
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
] ]
}, },
"classification": { "classification": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"correctness" "correctness"
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
] ]
}, },
"closed_qa": { "closed_qa": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"correctness" "correctness"
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
] ]
}, },
"extraction": { "extraction": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"correctness" "correctness"
...@@ -62,7 +62,7 @@ ...@@ -62,7 +62,7 @@
] ]
}, },
"generation": { "generation": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"diversity" "diversity"
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
] ]
}, },
"open_qa": { "open_qa": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"correctness" "correctness"
...@@ -84,7 +84,7 @@ ...@@ -84,7 +84,7 @@
] ]
}, },
"rewriting": { "rewriting": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"correctness" "correctness"
...@@ -96,7 +96,7 @@ ...@@ -96,7 +96,7 @@
] ]
}, },
"roleplay": { "roleplay": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"fidelity", "fidelity",
...@@ -107,7 +107,7 @@ ...@@ -107,7 +107,7 @@
] ]
}, },
"summarization": { "summarization": {
"GPT-3.5": [ "GPT": [
"language organization", "language organization",
"relevance", "relevance",
"correctness", "correctness",
......
...@@ -39,7 +39,8 @@ def main(args): ...@@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!") "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator # initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt) evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"])
if len(args.model_name_list) == 2: if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0]) answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1]) answers2 = jload(args.answer_file_list[1])
...@@ -87,6 +88,10 @@ if __name__ == '__main__': ...@@ -87,6 +88,10 @@ if __name__ == '__main__':
default=[], default=[],
required=True, required=True,
help='the names of at most 2 models') help='the names of at most 2 models')
parser.add_argument('--gpt_model',
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results') parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key') parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args() args = parser.parse_args()
......
...@@ -14,13 +14,15 @@ class Evaluator(object): ...@@ -14,13 +14,15 @@ class Evaluator(object):
""" """
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
Any]) -> None: gpt_model: str, language: str) -> None:
self.params = params self.params = params
self.battle_prompt = battle_prompt self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.gpt_model = gpt_model
self.language = language
self.automatic_metric_stats = dict() self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict() self.gpt_evaluation_results = dict()
self.battle_results = [] self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None: def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
...@@ -63,6 +65,10 @@ class Evaluator(object): ...@@ -63,6 +65,10 @@ class Evaluator(object):
# automatic evaluation # automatic evaluation
for category in self.params: for category in self.params:
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["Metrics"] category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {} self.automatic_metric_stats[category] = {}
...@@ -74,17 +80,21 @@ class Evaluator(object): ...@@ -74,17 +80,21 @@ class Evaluator(object):
for metric in category_metrics: for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric)) self.automatic_metric_stats[category].update(switch(metric=metric))
# gpt35 evaluation # gpt evaluation
for category in self.params: for category in self.params:
category_metrics = self.params[category]["GPT-3.5"] if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["GPT"]
prompt = self.gpt_evaluation_prompt.get(category, None) prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None: if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.") print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"] prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category], self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
prompt, category_metrics, category) category_metrics, category, self.gpt_model)
def save(self, path: str, model_name_list: List[str]) -> None: def save(self, path: str, model_name_list: List[str]) -> None:
""" """
...@@ -106,10 +116,10 @@ class Evaluator(object): ...@@ -106,10 +116,10 @@ class Evaluator(object):
# Save evaluation results for GPT-3.5 evaluation metrics. # Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = [] all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results") base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results") evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items(): for category, evaluations in self.gpt_evaluation_results.items():
jdump( jdump(
evaluations, evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0], os.path.join(evaluation_results_save_path, model_name_list[0],
...@@ -121,10 +131,10 @@ class Evaluator(object): ...@@ -121,10 +131,10 @@ class Evaluator(object):
# Start to calculate scores and save statistics. # Start to calculate scores and save statistics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics") evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations, gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path) evaluation_statistics_save_path)
# Save charts and csv. # Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses") evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path, gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path) evaluation_analyses_save_path)
...@@ -16,7 +16,7 @@ from utils import jdump, jload ...@@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]: def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
""" """
Get evaluation from GPT-4. Get battle evaluation from GPT-4.
Args: Args:
sys_prompt: prompt for the system. sys_prompt: prompt for the system.
...@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in ...@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
except Exception as e: except Exception as e:
print(e) print(e)
time.sleep(1) time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.") print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id} return {"evaluation": "", "id": id}
...@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa ...@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}") print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
                                        inst: Dict[str, Any],
                                        metrics: List[str],
                                        model: str = "gpt-3.5-turbo",
                                        max_tokens: int = 2048) -> Dict[str, Any]:
    """
    Use chat models (gpt-3.5-turbo or gpt-4) to evaluate one model answer.

    One chat-completion request is sent per metric. The results are stored in
    ``inst["evaluation"]``; ``inst`` is modified in place and also returned.
    Chat models do not return log probabilities here, so "logprobs" is always
    stored as None and only the plain-text response is kept.

    Args:
        prompt: a dictionary including prompt template, CoT and metrics.
        inst: the instruction that is needed to be evaluated.
        metrics: the metrics for evaluation.
        model: the model used to evaluate answers.
        max_tokens: the maximum number of tokens to generate in the chat completion.

    Returns:
        An evaluation of one answer. A metric whose request still fails after
        all retries is stored as an empty dictionary.

    Raises:
        Exception: if a metric has no description or no CoT steps in the prompt file.
    """

    MAX_API_RETRY = 3

    # "input" is optional: a missing or empty value means the instruction alone is the question.
    extra_input = inst.get("input", "")
    question = inst["instruction"] + " " + extra_input if extra_input else inst["instruction"]
    answer = inst["output"]
    inst["evaluation"] = {}

    for metric in metrics:
        metric_description = prompt["metrics"].get(metric, None)
        if metric_description is None:
            raise Exception(
                f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
            )

        steps = prompt.get("CoT", {}).get(metric, None)
        if steps is None:
            raise Exception(f"No CoT steps for metric {metric}! You should add them in the prompt file!")

        # Build the request text once, outside the retry loop: a broken template is a
        # configuration error and must not be retried and reported as an API failure.
        content = prompt["prompt"].format(
            question=question,
            answer=answer,
            metric=metric_description,
            steps=steps,
        )

        for attempt in range(MAX_API_RETRY):
            try:
                response = openai.ChatCompletion.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": content,
                        },
                    ],
                    temperature=0,
                    max_tokens=max_tokens,
                )
                inst["evaluation"][metric] = {
                    "response": response["choices"][0]["message"]["content"],
                    "logprobs": None,
                }
                break
            except Exception as e:
                print(e)
                # Back off before the next attempt; there is nothing to wait for after the last one.
                if attempt < MAX_API_RETRY - 1:
                    time.sleep(1)

        if metric not in inst["evaluation"]:
            print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
            inst["evaluation"][metric] = {}

    return inst
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use completion model(text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args: Args:
prompt: a dictionary including prompt template, CoT and metrics. prompt: a dictionary including prompt template, CoT and metrics.
...@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any], ...@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e: except Exception as e:
print(e) print(e)
time.sleep(1) time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst return inst
def gpt35_evaluate( def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
""" """
Use GPT-3.5 to evaluate model answers and save evaluation results. Use GPT models to evaluate model answers and save evaluation results.
Args: Args:
answers: model answers. answers: model answers.
prompt: prompt for GPT-3.5 evaluation. prompt: prompt for GPT evaluation.
metrics: metrics for GPT-3.5 evaluation. metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation. category: the category of the model answers for evaluation.
model: the specific GPT model used to evaluate answers.
Returns: Returns:
Evaluations of the given answers. Evaluations of the given answers.
...@@ -315,7 +379,12 @@ def gpt35_evaluate( ...@@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = [] futures = []
for inst in answers: for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1) # Completion models can return log probabilities.
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future) futures.append(future)
for future in tqdm.tqdm( for future in tqdm.tqdm(
...@@ -334,20 +403,19 @@ def gpt35_evaluate( ...@@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
""" """
Calculate score from log probabilities returned by text-davinci-003. Calculate the score according to log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculation formula: Calculation formula:
score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability. score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634 Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling). This paper proposes NLG evaluation methods using text-davinci-003(log probabilities returned by completion models) and GPT-4(probabilities obtained by sampling).
Args: Args:
logprobs: logprobs returned by openai.Completion. logprobs: logprobs returned by openai.Completion.
Returns: Returns:
Score of one answer. The score of one answer.
""" """
# GPT-3.5 only returns score of 1 to 5. # GPT-3.5 only returns score of 1 to 5.
...@@ -369,7 +437,31 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float: ...@@ -369,7 +437,31 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score return score
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
    """
    Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
    Different from text-davinci-003, this function directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
    Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.

    Args:
        response: the plain-text response returned by gpt-3.5-turbo or gpt-4.
        evaluation: the evaluation corresponds to the question (kept for interface compatibility; not used for scoring).

    Returns:
        The score of one answer: the digit found in the response when it contains exactly one digit character, otherwise 0.
    """

    # A non-string response (e.g. None) cannot be parsed and is scored as 0.
    if not isinstance(response, str):
        return 0

    digits = re.findall(r"\d", response)

    # Only an unambiguous answer counts: no digit at all, or more than one
    # digit (e.g. "4/5" or "10"), is treated as an invalid score.
    if len(digits) != 1:
        return 0

    return int(digits[0])
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
""" """
Generate statistics for one model. Generate statistics for one model.
...@@ -396,7 +488,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s ...@@ -396,7 +488,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics} scores = {metric: [] for metric in metrics}
for evaluation in data: for evaluation in data:
for metric in metrics: for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])) if evaluation["evaluation"][metric] == {}:
# This means after 3 retries, the server still returns an error and we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {} statistics = {}
for metric in metrics: for metric in metrics:
...@@ -414,7 +514,7 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s ...@@ -414,7 +514,7 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
) )
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None: def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
""" """
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory. Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
...@@ -474,7 +574,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> ...@@ -474,7 +574,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path) os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all) frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv")) frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm( for category in tqdm.tqdm(
frame_per_category.keys(), frame_per_category.keys(),
......
[ {
{ "brainstorming": {
"id": 1, "id": 1,
"category": "brainstorming", "category": "brainstorming",
"metrics": { "metrics": {
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
}, },
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "chat": {
"id": 2, "id": 2,
"category": "chat", "category": "chat",
"metrics": { "metrics": {
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "classification": {
"id": 3, "id": 3,
"category": "classification", "category": "classification",
"metrics": { "metrics": {
...@@ -52,7 +52,7 @@ ...@@ -52,7 +52,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "closed_qa": {
"id": 4, "id": 4,
"category": "closed_qa", "category": "closed_qa",
"metrics": { "metrics": {
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
}, },
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "extraction": {
"id": 5, "id": 5,
"category": "extraction", "category": "extraction",
"metrics": { "metrics": {
...@@ -82,7 +82,7 @@ ...@@ -82,7 +82,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "generation": {
"id": 6, "id": 6,
"category": "generation", "category": "generation",
"metrics": { "metrics": {
...@@ -97,7 +97,7 @@ ...@@ -97,7 +97,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "open_qa": {
"id": 7, "id": 7,
"category": "open_qa", "category": "open_qa",
"metrics": { "metrics": {
...@@ -112,7 +112,7 @@ ...@@ -112,7 +112,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "rewriting": {
"id": 8, "id": 8,
"category": "rewriting", "category": "rewriting",
"metrics": { "metrics": {
...@@ -127,7 +127,7 @@ ...@@ -127,7 +127,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "roleplay": {
"id": 9, "id": 9,
"category": "roleplay", "category": "roleplay",
"metrics": { "metrics": {
...@@ -144,7 +144,7 @@ ...@@ -144,7 +144,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "summarization": {
"id": 10, "id": 10,
"category": "summarization", "category": "summarization",
"metrics": { "metrics": {
...@@ -161,7 +161,7 @@ ...@@ -161,7 +161,7 @@
}, },
"prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}, },
{ "general": {
"id": 11, "id": 11,
"category": "general", "category": "general",
"metrics": { "metrics": {
...@@ -176,4 +176,4 @@ ...@@ -176,4 +176,4 @@
}, },
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}" "prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
} }
] }
...@@ -57,6 +57,7 @@ def get_data_per_category(data, categories): ...@@ -57,6 +57,7 @@ def get_data_per_category(data, categories):
data_per_category = {category: [] for category in categories} data_per_category = {category: [] for category in categories}
for item in data: for item in data:
category = item["category"] category = item["category"]
if category in categories:
data_per_category[category].append(item) data_per_category[category].append(item)
return data_per_category return data_per_category
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment