"vscode:/vscode.git/clone" did not exist on "00a9c781fd20231a4ae4f26fae768e714d8808ec"
Unverified commit 2506e275 authored by Yuanchen and committed by GitHub

[evaluation] improvement on evaluation (#3862)



* fix a bug where the config file contains a category that the answer file does not contain

* fix Chinese prompt file

* support gpt-3.5-turbo and gpt-4 evaluation

* polish and update README

* resolve pr comments

---------
Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
parent b0474878
......@@ -2,7 +2,7 @@
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"creativity",
......@@ -14,7 +14,7 @@
]
},
"chat": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"naturalness",
......@@ -26,7 +26,7 @@
]
},
"classification": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -38,7 +38,7 @@
]
},
"closed_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -50,7 +50,7 @@
]
},
"extraction": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -62,7 +62,7 @@
]
},
"generation": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"diversity"
......@@ -74,7 +74,7 @@
]
},
"open_qa": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -84,7 +84,7 @@
]
},
"rewriting": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness"
......@@ -96,7 +96,7 @@
]
},
"roleplay": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"fidelity",
......@@ -107,7 +107,7 @@
]
},
"summarization": {
"GPT-3.5": [
"GPT": [
"language organization",
"relevance",
"correctness",
......
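For reference, a minimal sketch of what one category entry in the evaluation config looks like after this rename. The structure is inferred from the fragments shown above and from evaluator.py, which reads params[category]["Metrics"] for automatic metrics and params[category]["GPT"] for GPT-scored metrics; the "Metrics" lists are collapsed in this diff, so their contents below are placeholders.

config = {
    "language": "cn",
    "category": {
        "brainstorming": {
            # metrics scored by a GPT model (key renamed from "GPT-3.5" to "GPT")
            "GPT": ["language organization", "relevance", "creativity"],
            # automatic metrics, accessed as params[category]["Metrics"] in evaluator.py
            "Metrics": ["<automatic metric names>"],
        },
        # ...the other categories (chat, classification, closed_qa, ...) follow the same shape
    },
}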
......@@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
config["language"])
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
......@@ -87,6 +88,10 @@ if __name__ == '__main__':
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--gpt_model',
default="gpt-3.5-turbo",
choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()
......
......@@ -14,13 +14,15 @@ class Evaluator(object):
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
Any]) -> None:
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
gpt_model: str, language: str) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.gpt_model = gpt_model
self.language = language
self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict()
self.gpt_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
......@@ -63,6 +65,10 @@ class Evaluator(object):
# automatic evaluation
for category in self.params:
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
......@@ -74,17 +80,21 @@ class Evaluator(object):
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))
# gpt35 evaluation
# gpt evaluation
for category in self.params:
category_metrics = self.params[category]["GPT-3.5"]
if len(answers_per_category[category]) == 0:
print(f"Category {category} specified in your config doesn't have corresponding answers!")
continue
category_metrics = self.params[category]["GPT"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
prompt, category_metrics, category)
self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
category_metrics, category, self.gpt_model)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
......@@ -106,10 +116,10 @@ class Evaluator(object):
# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items():
for category, evaluations in self.gpt_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
......@@ -121,10 +131,10 @@ class Evaluator(object):
# Start to calculate scores and save statistics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
gpt_evaluate.analyze_gpt_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
......@@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Get battle evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
......@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: in
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
......@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_pa
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Use chat models (gpt-3.5-turbo or gpt-4) to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
model: the model used to evaluate answers.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[
{
"role":
"user",
"content":
prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
},
],
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["message"]["content"],
"logprobs": None,
}
break
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use completion models (text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
......@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Use GPT models to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
prompt: prompt for GPT evaluation.
metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation.
model: the specific GPT model used to evaluate answers.
Returns:
Evaluations of the given answers.
......@@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
# Completion models can return log probabilities.
if model == "text-davinci-003":
future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
else:
future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future)
for future in tqdm.tqdm(
......@@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculate the score according to log probabilities returned by text-davinci-003.
Calculation formula:
score = sum(score_i * exp(value)), where score_i is the score corresponding to the key (predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
This paper proposes NLG evaluation methods using text-davinci-003 (log probabilities returned by completion models) and GPT-4 (probabilities obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
The score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
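To make the formula above concrete, here is a hedged sketch of turning the top log probabilities of the predicted score token into an expected score; the repository's own function body is collapsed in this diff, so this is an illustration rather than the exact implementation.

import math
from typing import Dict

def expected_score_from_logprobs(top_logprobs: Dict[str, float]) -> float:
    # top_logprobs maps candidate tokens to their log probabilities, as returned for
    # one position by openai.Completion when logprobs is requested.
    score, prob_mass = 0.0, 0.0
    for token, logprob in top_logprobs.items():
        token = token.strip()
        if token in {"1", "2", "3", "4", "5"}:    # scores are restricted to 1-5
            prob = math.exp(logprob)              # convert log probability to probability
            score += int(token) * prob            # score_i * exp(value)
            prob_mass += prob
    # Renormalizing over the score tokens that actually appear is an assumption here;
    # without it the result is exactly sum(score_i * exp(value)) from the docstring.
    return score / prob_mass if prob_mass > 0 else 0.0

For example, top_logprobs = {"4": math.log(0.7), "5": math.log(0.3)} yields 4 * 0.7 + 5 * 0.3 = 4.3.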
......@@ -369,7 +437,31 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Unlike text-davinci-003, this function calculates the score directly from the plain text response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
Args:
response: the plain text response returned by gpt-3.5-turbo or gpt-4.
evaluation: the evaluation corresponding to the question.
Returns:
The score of one answer.
"""
try:
results = re.findall(r"\d", response)
if len(results) == 1:
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return 0
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
......@@ -396,7 +488,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
if evaluation["evaluation"][metric] == {}:
# This means the server still returned an error after 3 retries, so we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {}
for metric in metrics:
......@@ -414,7 +514,7 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
)
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
......@@ -474,7 +574,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
......
[
{
{
"brainstorming": {
"id": 1,
"category": "brainstorming",
"metrics": {
......@@ -18,7 +18,7 @@
},
"prompt": "你是一个好助手。请你为下面“头脑风暴”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"chat": {
"id": 2,
"category": "chat",
"metrics": {
......@@ -37,7 +37,7 @@
},
"prompt": "你是一个好助手。请你为下面的“补全对话”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"classification": {
"id": 3,
"category": "classification",
"metrics": {
......@@ -52,7 +52,7 @@
},
"prompt": "你是一个好助手。请你为下面的“分类“问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"closed_qa": {
"id": 4,
"category": "closed_qa",
"metrics": {
......@@ -67,7 +67,7 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"extraction": {
"id": 5,
"category": "extraction",
"metrics": {
......@@ -82,7 +82,7 @@
},
"prompt": "你是一个好助手。请你为下面的“提取”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"generation": {
"id": 6,
"category": "generation",
"metrics": {
......@@ -97,7 +97,7 @@
},
"prompt": "你是一个好助手。请你为下面的“生成”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"open_qa": {
"id": 7,
"category": "open_qa",
"metrics": {
......@@ -112,7 +112,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"rewriting": {
"id": 8,
"category": "rewriting",
"metrics": {
......@@ -127,7 +127,7 @@
},
"prompt": "你是一个好助手。请你为下面的问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"roleplay": {
"id": 9,
"category": "roleplay",
"metrics": {
......@@ -144,7 +144,7 @@
},
"prompt": "你是一个好助手。请你为下面的“角色扮演”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"summarization": {
"id": 10,
"category": "summarization",
"metrics": {
......@@ -161,7 +161,7 @@
},
"prompt": "你是一个好助手。请你为下面的“总结”问题的答案打分。\n\n问题如下:\n\n{question}\n\n答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
},
{
"general": {
"id": 11,
"category": "general",
"metrics": {
......@@ -176,4 +176,4 @@
},
"prompt": "你是一个好助手。请你为下面问题的答案打分。\n\n问题如下:\n\n{question}\n\n需要你评分的答案如下:\n\n{answer}\n\n评分的指标如下:\n\n{metric}\n\n请你遵照以下的评分步骤:\n\n{steps}"
}
]
}
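For orientation, a hedged sketch of the new prompt-file shape introduced by this change: the top-level JSON array becomes an object keyed by category, which is what self.gpt_evaluation_prompt.get(category, None) in evaluator.py looks up. The field names follow the diff and the code (prompt["metrics"], prompt["CoT"], prompt["prompt"]); the values below are illustrative English stand-ins for the Chinese originals.

gpt_evaluation_prompt = {
    "brainstorming": {
        "id": 1,
        "category": "brainstorming",
        "metrics": {"language organization": "<description of the metric>"},
        "CoT": {"language organization": "<scoring steps for the metric>"},
        "prompt": ("You are a helpful assistant. Please score the answer to the following "
                   "brainstorming question.\n\nQuestion:\n\n{question}\n\nAnswer:\n\n{answer}\n\n"
                   "Scoring metric:\n\n{metric}\n\nScoring steps:\n\n{steps}"),
    },
    # ...one entry per category, ending with a "general" entry used as a fallback when a
    # category has no dedicated prompt.
}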
......@@ -57,6 +57,7 @@ def get_data_per_category(data, categories):
data_per_category = {category: [] for category in categories}
for item in data:
category = item["category"]
data_per_category[category].append(item)
if category in categories:
data_per_category[category].append(item)
return data_per_category
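A quick, hypothetical illustration of this fix (the data below is made up; the category names come from the config shown earlier): answer items whose category is not listed in the config are now skipped instead of raising a KeyError.

data = [
    {"category": "brainstorming", "instruction": "...", "output": "..."},
    {"category": "coding", "instruction": "...", "output": "..."},   # not in the config
]
categories = ["brainstorming", "chat"]

result = get_data_per_category(data, categories)
# before this change: KeyError: 'coding'
# after this change:  {"brainstorming": [<the first item>], "chat": []}
# the empty "chat" list is what triggers the new missing-answers warning in evaluator.py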