Unverified Commit c622bb36 authored by Frank Lee, committed by GitHub

Merge pull request #3915 from FrankLeeeee/update/develop

[sync] update develop with main
parents 34966378 9c88b6cb
@@ -130,3 +130,7 @@ class Strategy(ABC):
only_rank0: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None) -> None:
pass
@abstractmethod
def get_model_state_dict_shard(self, model: nn.Module, **config):
pass
\ No newline at end of file
@@ -186,3 +186,15 @@ class ColossalAIStrategy(DDPStrategy):
if self.stage == 3:
raise RuntimeError('ColossalAI strategy with stage-3 does not support save_pretrained() now')
super().save_pretrained(model, path, only_rank0, tokenizer)
def get_model_state_dict_shard(self, model: nn.Module, **config):
if self.stage != 3:
yield from super().get_model_state_dict_shard(model, **config)
else:
# unwrapped_model = self._unwrap_model(model)
# for module in unwrapped_model.modules():
# if isinstance(module, LoraLinear):
# module.merge_weights = True
# module.eval()
base_model: ZeroDDP = get_base_model(model)
yield from base_model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
@@ -26,19 +26,8 @@ class DDPStrategy(NaiveStrategy):
super().__init__()
def setup_distributed(self) -> None:
- try:
-     rank = int(os.environ['RANK'])
-     local_rank = int(os.environ['LOCAL_RANK'])
-     world_size = int(os.environ['WORLD_SIZE'])
-     host = os.environ['MASTER_ADDR']
-     port = int(os.environ['MASTER_PORT'])
- except KeyError as e:
-     raise RuntimeError(
-         f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
-     )
- dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
+ self._try_init_dist(force=True)
self.set_seed(self.seed)
- torch.cuda.set_device(local_rank)
def set_seed(self, seed: int) -> None:
random.seed(seed)
...
- from typing import Any, Optional
+ import os
+ import sys
+ from collections import OrderedDict
+ from typing import Any, Dict, Optional
import torch
+ import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from coati.models.base import get_base_model
from coati.replay_buffer import ReplayBuffer
+ from coati.models.base import RewardModel
+ from coati.models.lora import LoraLinear
+ from coati.replay_buffer import ReplayBuffer
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from transformers.modeling_utils import PreTrainedModel
@@ -13,6 +20,15 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from .base import Strategy
# TODO Move this to a util.py (Moving to ray.util introduces ringed import)
def get_grad_required_state_dict(model: nn.Module):
state_dict = OrderedDict()
for name, parameter in model.named_parameters():
if parameter.requires_grad:
state_dict[name] = parameter.detach()
return state_dict
class NaiveStrategy(Strategy):
"""
Strategy for single GPU. No parallelism is used.
@@ -25,7 +41,7 @@ class NaiveStrategy(Strategy):
optimizer.step()
def setup_distributed(self) -> None:
- pass
+ self._try_init_dist(force=False)
def setup_model(self, model: nn.Module) -> nn.Module:
return model
@@ -68,3 +84,45 @@ class NaiveStrategy(Strategy):
unwrapped_model.save_pretrained(path)
if tokenizer is not None:
tokenizer.save_pretrained(path)
def get_model_state_dict_shard(self, model: nn.Module, **config):
# TODO: implement sharding on naive strategy
model = self.unwrap_model(model)
if 'requires_grad_only' in config and config['requires_grad_only'] == True:
state_dict = get_grad_required_state_dict(model)
else:
state_dict = model.state_dict()
if 'shard_size' in config:
shard_size = config['shard_size']
accumulate_size = 0
state_dict_shard = OrderedDict()
for name, param in state_dict.items():
state_dict_shard[name] = param
accumulate_size += param.numel() * param.element_size()
if accumulate_size >= shard_size:
accumulate_size = 0
yield state_dict_shard
state_dict_shard = OrderedDict()
if accumulate_size > 0:
yield state_dict_shard
else:
yield state_dict
def _try_init_dist(self, force: bool = False) -> None:
try:
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
host = os.environ['MASTER_ADDR']
port = int(os.environ['MASTER_PORT'])
dist.init_process_group('nccl', init_method=f'tcp://[{host}]:{port}', world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
except KeyError as e:
if force:
raise RuntimeError(
f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
)
except Exception as e:
if force:
raise e
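The new `get_model_state_dict_shard` packs parameters into an `OrderedDict` until the accumulated byte size reaches `shard_size`, yields that shard, and finally flushes whatever remains. A minimal, self-contained sketch of the same pattern; the toy model and the 1 KB threshold are illustrative assumptions, not values from this PR:

```python
# Self-contained sketch of the shard-by-byte-size pattern used in
# NaiveStrategy.get_model_state_dict_shard; the toy model and the
# 1 KB threshold below are only for illustration.
from collections import OrderedDict

import torch.nn as nn


def iter_state_dict_shards(model: nn.Module, shard_size: int):
    shard, accumulated = OrderedDict(), 0
    for name, param in model.state_dict().items():
        shard[name] = param
        accumulated += param.numel() * param.element_size()
        if accumulated >= shard_size:
            yield shard
            shard, accumulated = OrderedDict(), 0
    if accumulated > 0:
        yield shard    # flush the tail shard


if __name__ == "__main__":
    toy = nn.Sequential(nn.Linear(64, 64), nn.Linear(64, 8))
    for i, shard in enumerate(iter_state_dict_shards(toy, shard_size=1024)):
        size = sum(p.numel() * p.element_size() for p in shard.values())
        print(f"shard {i}: {len(shard)} tensors, {size} bytes")
```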
@@ -27,6 +27,7 @@ class DistributedSampler:
assert len(indices) == self.num_samples
self.indices = indices
def sample(self, batch_size: int) -> list:
sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
return [self.dataset[idx] for idx in sampled_indices]
This diff is collapsed.
@@ -2,7 +2,7 @@
"language": "cn",
"category": {
"brainstorming": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"creativity",
@@ -14,7 +14,7 @@
]
},
"chat": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"naturalness",
@@ -26,7 +26,7 @@
]
},
"classification": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"correctness"
@@ -38,7 +38,7 @@
]
},
"closed_qa": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"correctness"
@@ -50,7 +50,7 @@
]
},
"extraction": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"correctness"
@@ -62,7 +62,7 @@
]
},
"generation": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"diversity"
@@ -74,7 +74,7 @@
]
},
"open_qa": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"correctness"
@@ -84,7 +84,7 @@
]
},
"rewriting": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"correctness"
@@ -96,7 +96,7 @@
]
},
"roleplay": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"fidelity",
@@ -107,7 +107,7 @@
]
},
"summarization": {
- "GPT-3.5": [
+ "GPT": [
"language organization",
"relevance",
"correctness",
...
{
"language": "en",
"category": {
"brainstorming": {
"GPT": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
@@ -14,7 +14,7 @@ def main(args):
# load config
config = jload(args.config_file)
- if config["language"] == "cn":
+ if config["language"] in ["cn", "en"]:
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
@@ -39,7 +39,8 @@ def main(args):
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
- evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
+ evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt, args.gpt_model,
+                       config["language"])
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
@@ -87,6 +88,10 @@ if __name__ == '__main__':
default=[],
required=True,
help='the names of at most 2 models')
+ parser.add_argument('--gpt_model',
+                     default="gpt-3.5-turbo",
+                     choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-4"],
+                     help='which GPT model to use for evaluation')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()
...
@@ -4,7 +4,7 @@ from typing import Any, Dict, List
import gpt_evaluate
import metrics
import pandas as pd
- from utils import get_data_per_category, jdump
+ from utils import analyze_automatic_results, get_data_per_category, save_automatic_results
class Evaluator(object):
@@ -14,13 +14,15 @@ class Evaluator(object):
"""
- def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
-              Any]) -> None:
+ def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str, Any],
+              gpt_model: str, language: str) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
+ self.gpt_model = gpt_model
+ self.language = language
self.automatic_metric_stats = dict()
- self.gpt35_evaluation_results = dict()
+ self.gpt_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
@@ -40,21 +42,21 @@ class Evaluator(object):
"""
- def switch(metric):
+ def switch(metric, language):
if metric == "BLEU":
- return metrics.bleu_score(preds=predicts_list, targets=targets_list)
+ return metrics.bleu_score(preds=predicts_list, targets=targets_list, language=language)
elif metric == "ROUGE":
- return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
+ return metrics.rouge_score(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Distinct"):
- return metrics.distinct_score(preds=predicts_list)
+ return metrics.distinct_score(preds=predicts_list, language=language)
elif (metric == "BERTScore"):
- return metrics.bert_score(preds=predicts_list, targets=targets_list)
+ return metrics.bert_score(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Precision"):
- return metrics.precision(preds=predicts_list, targets=targets_list)
+ return metrics.precision(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "Recall"):
- return metrics.recall(preds=predicts_list, targets=targets_list)
+ return metrics.recall(preds=predicts_list, targets=targets_list, language=language)
elif (metric == "F1 score"):
- return metrics.F1_score(preds=predicts_list, targets=targets_list)
+ return metrics.F1_score(preds=predicts_list, targets=targets_list, language=language)
else:
raise ValueError(f"Unexpected metric")
@@ -63,6 +65,10 @@ class Evaluator(object):
# automatic evaluation
for category in self.params:
+ if len(answers_per_category[category]) == 0:
+     print(f"Category {category} specified in your config doesn't have corresponding answers!")
+     continue
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
@@ -72,19 +78,23 @@ class Evaluator(object):
predicts_list = [answer["output"] for answer in answers_per_category[category]]
for metric in category_metrics:
- self.automatic_metric_stats[category].update(switch(metric=metric))
+ self.automatic_metric_stats[category].update(switch(metric=metric, language=self.language))
- # gpt35 evaluation
+ # gpt evaluation
for category in self.params:
- category_metrics = self.params[category]["GPT-3.5"]
+ if len(answers_per_category[category]) == 0:
+     print(f"Category {category} specified in your config doesn't have corresponding answers!")
+     continue
+ category_metrics = self.params[category]["GPT"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
- self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
-                                                                       prompt, category_metrics, category)
+ self.gpt_evaluation_results[category] = gpt_evaluate.evaluate(answers_per_category[category], prompt,
+                                                               category_metrics, category, self.gpt_model)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
@@ -96,35 +106,29 @@ class Evaluator(object):
save_path = os.path.join(path, "gpt_evaluate", "battle_results")
gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
else:
- # save evaluation results for automatic metrics
- automatic_df = pd.DataFrame(self.automatic_metric_stats)
- automatic_results_save_path = os.path.join(path, "automatic_results")
- if not os.path.exists(automatic_results_save_path):
-     os.makedirs(automatic_results_save_path)
- automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
- # Save evaluation results for GPT-3.5 evaluation metrics.
- all_evaluations = []
- base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
- evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
- for category, evaluations in self.gpt35_evaluation_results.items():
-     jdump(
-         evaluations,
-         os.path.join(evaluation_results_save_path, model_name_list[0],
-                      f"{category}_evaluation_results.json"))
-     all_evaluations.extend(evaluations)
- jdump(all_evaluations,
-       os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
- # Start to calculate scores and save statictics.
- evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
- gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
-                                               evaluation_statistics_save_path)
- # Save charts and csv.
- evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
- gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
-                                                  evaluation_analyses_save_path)
+ # Save evaluation results for automatic metrics
+ automatic_base_save_path = os.path.join(path, "automatic_results")
+ automatic_results_save_path = os.path.join(automatic_base_save_path, "evaluation_results")
+ save_automatic_results(model_name_list[0], self.automatic_metric_stats, automatic_results_save_path)
+ # Save charts and csv.
+ automatic_analyses_save_path = os.path.join(automatic_base_save_path, "evaluation_analyses")
+ analyze_automatic_results(automatic_results_save_path, automatic_analyses_save_path)
+ # Save evaluation results for GPT evaluation metrics.
+ gpt_base_save_path = os.path.join(path, "gpt_evaluate", "gpt_evaluate_results")
+ gpt_evaluation_results_save_path = os.path.join(gpt_base_save_path, "evaluation_results")
+ all_evaluations = gpt_evaluate.save_gpt_evaluation_results(model_name_list[0], self.gpt_evaluation_results,
+                                                            gpt_evaluation_results_save_path)
+ # Start to calculate scores and save statistics.
+ gpt_evaluation_statistics_save_path = os.path.join(gpt_base_save_path, "evaluation_statistics")
+ gpt_evaluate.save_gpt_evaluation_statistics(model_name_list[0], all_evaluations,
+                                             gpt_evaluation_statistics_save_path)
+ # Save charts and csv.
+ gpt_evaluation_analyses_save_path = os.path.join(gpt_base_save_path, "evaluation_analyses")
+ gpt_evaluate.analyze_gpt_evaluation_statistics(gpt_evaluation_statistics_save_path,
+                                                gpt_evaluation_analyses_save_path)
@@ -16,7 +16,7 @@ from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
- Get evaluation from GPT-4.
+ Get battle evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
@@ -51,7 +51,7 @@ def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
except Exception as e:
print(e)
time.sleep(1)
- print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
+ print(f"Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
@@ -233,12 +233,77 @@ def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
- def get_gpt35_evaluation(prompt: Dict[str, Any],
+ def get_gpt_evaluation_without_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
+ model: str = "gpt-3.5-turbo",
max_tokens: int = 2048) -> Dict[str, Any]:
"""
- Use GPT-3.5 to evaluate one model answer.
+ Use chat models(gpt-3.5-turbo or gpt-4) to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
model: the model used to evaluate answers.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[
{
"role":
"user",
"content":
prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
},
],
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["message"]["content"],
"logprobs": None,
}
break
except Exception as e:
print(e)
time.sleep(1)
if metric not in inst["evaluation"]:
print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
inst["evaluation"][metric] = {}
return inst
def get_gpt_evaluation_with_logprobs(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use completion model(text-davinci-003) to evaluate one model answer.
Only completion models can return log probabilities.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
@@ -283,23 +348,22 @@ def get_gpt35_evaluation(prompt: Dict[str, Any],
except Exception as e:
print(e)
time.sleep(1)
+ if metric not in inst["evaluation"]:
+     print(f"Evaluation {inst['id']} for metric {metric} failed after {MAX_API_RETRY} retries.")
+     inst["evaluation"][metric] = {}
return inst
- def gpt35_evaluate(
-     answers: List[Dict],
-     prompt: Dict[str, Any],
-     metrics: List[str],
-     category: str,
- ) -> List[Dict]:
+ def evaluate(answers: List[Dict], prompt: Dict[str, Any], metrics: List[str], category: str, model: str) -> List[Dict]:
"""
- Use GPT-3.5 to evaluate model answers and save evaluation results.
+ Use GPT models to evaluate model answers and save evaluation results.
Args:
answers: model answers.
- prompt: prompt for GPT-3.5 evaluation.
+ prompt: prompt for GPT evaluation.
- metrics: metrics for GPT-3.5 evaluation.
+ metrics: metrics for GPT evaluation.
category: the category of the model answers for evaluation.
+ model: the specific GPT model used to evaluate answers.
Returns:
Evaluations of the given answers.
@@ -315,7 +379,12 @@ def gpt35_evaluate(
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
- future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
+ # Completion models can return log probabilities.
+ if model == "text-davinci-003":
+     future = executor.submit(get_gpt_evaluation_with_logprobs, prompt, inst, metrics, 1)
+ else:
+     future = executor.submit(get_gpt_evaluation_without_logprobs, prompt, inst, metrics, model, 1)
futures.append(future)
for future in tqdm.tqdm(
@@ -334,20 +403,19 @@ def gpt35_evaluate(
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
- Calculate score from log probabilities returned by text-davinci-003.
- Only openai.Completion can return logprobs.
+ Calculate the score according to log probabilities returned by text-davinci-003.
Calculation formula:
score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
- This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
+ This paper proposes NLG evaluation methods using text-davinci-003(log probabilities returned by completion models) and GPT-4(probabilities obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
- Score of one answer.
+ The score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
@@ -369,14 +437,59 @@ def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
return score
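The docstring above defines the score as an expectation over the predicted score tokens: score = sum(score_i * exp(logprob_i)). A small sketch of that formula on invented log probabilities (real values come from `openai.Completion` with logprobs enabled):

```python
# Sketch of the expected-score formula quoted in the docstring above:
#     score = sum(score_i * exp(logprob_i))
# The tokens and log probabilities below are invented for illustration.
import math

top_logprobs = {"5": -0.22, "4": -1.80, "3": -3.50}    # token -> log probability

expected_score = sum(int(token) * math.exp(logprob) for token, logprob in top_logprobs.items())
print(round(expected_score, 3))    # a weighted average, here close to 4.8
```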
- def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
+ def calculate_scores_form_response(response: str, evaluation: Dict[str, Any]) -> int:
"""
Calculate the score from the response returned by gpt-3.5-turbo or gpt-4.
Different from text-davinci-003, this fuction directly calculates the score according to the plain response returned by gpt-3.5-turbo or gpt-4.
Although text-davinci-003 can return log probabilities, it costs ten times as much as gpt-3.5-turbo.
Args:
response: logprobs returned by openai.Completion.
evaluation: the evaluation corresponds to the question.
Returns:
The score of one answer.
"""
try:
results = re.findall(r"\d", response)
if len(results) == 1:
return int(results[0])
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return 0
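`calculate_scores_form_response` simply pulls the single digit that the chat model is prompted to output and falls back to 0 when the reply is missing or ambiguous. A quick illustration of that rule; the sample replies are made up:

```python
# Illustration of the digit-extraction rule used by calculate_scores_form_response:
# exactly one digit in the reply -> that digit is the score, anything else -> 0.
import re


def parse_score(response: str) -> int:
    results = re.findall(r"\d", response)
    return int(results[0]) if len(results) == 1 else 0


print(parse_score("4"))                       # 4
print(parse_score("I would rate this a 5."))  # 5
print(parse_score("Scores: 3 and 4"))         # 0 (ambiguous, treated as invalid)
print(parse_score("no score given"))          # 0
```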
def save_gpt_evaluation_results(model_name: str, gpt_evaluation_results: Dict[str, Any],
save_path: str) -> Dict[str, Any]:
"""
Save evaluation results for different categories for one model.
Args:
model_name: name of the model for saving evaluation results.
gpt_evaluation_results: evaluations results for all of the model answers.
save_path: path to save GPT evaluation statistics.
"""
all_evaluations = []
for category, evaluations in gpt_evaluation_results.items():
jdump(evaluations, os.path.join(save_path, model_name, f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations, os.path.join(save_path, f"{model_name}_evaluation_results.json"))
return all_evaluations
def save_gpt_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
""" """
Generate statistics for one model. Generate statistics for one model.
Args: Args:
model_name: name of the model for saving statistics. model_name: name of the model for saving statistics.
evaluations: evaluations for all of the model answers. evaluations: evaluations for all of the model answers.
save_path: path to save GPT-3.5 evaluation statistics. save_path: path to save GPT evaluation statistics.
""" """
if not os.path.exists(save_path): if not os.path.exists(save_path):
...@@ -396,7 +509,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s ...@@ -396,7 +509,15 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
scores = {metric: [] for metric in metrics} scores = {metric: [] for metric in metrics}
for evaluation in data: for evaluation in data:
for metric in metrics: for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0])) if evaluation["evaluation"][metric] == {}:
# This means after 3 retries, the server still returns an error and we set the score to 0.
scores[metric].append(0)
elif evaluation["evaluation"][metric]["logprobs"] is not None:
scores[metric].append(
calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
else:
scores[metric].append(
calculate_scores_form_response(evaluation["evaluation"][metric]["response"], evaluation))
statistics = {} statistics = {}
for metric in metrics: for metric in metrics:
...@@ -414,9 +535,9 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s ...@@ -414,9 +535,9 @@ def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], s
) )
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None: def analyze_gpt_evaluation_statistics(statistics_path: str, save_path: str) -> None:
""" """
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory. Analyze and visualize all GPT evaluation statistics in the given directory.
Args: Args:
statistics_path: path to all the models' statistics. statistics_path: path to all the models' statistics.
...@@ -474,7 +595,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> ...@@ -474,7 +595,7 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
os.makedirs(save_path) os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all) frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv")) frame_all.to_csv(os.path.join(save_path, "gpt_evaluation_statistics.csv"))
for category in tqdm.tqdm( for category in tqdm.tqdm(
frame_per_category.keys(), frame_per_category.keys(),
...@@ -494,3 +615,5 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> ...@@ -494,3 +615,5 @@ def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) ->
figure = fig.get_figure() figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400) figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
plt.close()
import statistics
+ from typing import Dict, List
import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_chinese import Rouge as Rouge_cn
+ from rouge_score import rouge_scorer as Rouge_en
from sklearn.metrics import f1_score, precision_score, recall_score
+ from utils import preprocessing_text, remove_redundant_space
- def bleu_score(preds: list, targets: list) -> dict:
+ def bleu_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BLEU Score Metric
The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
@@ -21,8 +24,12 @@ def bleu_score(preds: list, targets: list) -> dict:
(1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
for pred, target in zip(preds, targets):
- pred_list = (' '.join(jieba.cut(pred))).split()
- target_list = [(' '.join(jieba.cut(target))).split()]
+ if language == "cn":
+     pred_list = ' '.join(jieba.cut(preprocessing_text(pred))).split()
+     target_list = [(' '.join(jieba.cut(preprocessing_text(target)))).split()]
+ elif language == "en":
+     pred_list = preprocessing_text(pred).split()
+     target_list = [preprocessing_text(target).split()]
bleu = sentence_bleu(target_list, pred_list, weights=weights)
cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
@@ -33,7 +40,7 @@ def bleu_score(preds: list, targets: list) -> dict:
return bleu_scores
- def rouge_cn_score(preds: list, targets: list) -> dict:
+ def rouge_cn_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate Chinese ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
@@ -41,13 +48,13 @@ def rouge_cn_score(preds: list, targets: list) -> dict:
the preds and targets. ROUGE-L measures the number of matching
longest common subsequence (LCS) between preds and targets.
"""
- rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}}
+ rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
for pred, target in zip(preds, targets):
- pred_list = ' '.join(jieba.cut(pred))
- target_list = ' '.join(jieba.cut(target))
+ pred_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(pred))))
+ target_list = remove_redundant_space(' '.join(jieba.cut(preprocessing_text(target))))
all_preds.append(pred_list)
all_targets.append(target_list)
@@ -61,7 +68,42 @@ def rouge_cn_score(preds: list, targets: list) -> dict:
return rouge_scores
- def distinct_score(preds: list) -> dict:
+ def rouge_en_score(preds: List[str], targets: List[str]) -> Dict[str, float]:
"""Calculate English ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
the preds and targets. ROUGE-L measures the number of matching
longest common subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
all_preds = []
all_targets = []
rouge_en = Rouge_en.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
for pred, target in zip(preds, targets):
score = rouge_en.score(preprocessing_text(pred), preprocessing_text(target))
rouge_scores["rouge1"] += score['rouge1'].fmeasure
rouge_scores["rouge2"] += score['rouge2'].fmeasure
rouge_scores["rougeL"] += score['rougeL'].fmeasure
rouge_scores["rouge1"] = rouge_scores["rouge1"] / len(preds)
rouge_scores["rouge2"] = rouge_scores["rouge2"] / len(preds)
rouge_scores["rougeL"] = rouge_scores["rougeL"] / len(preds)
return rouge_scores
def rouge_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate ROUGE Score Metric"""
if language == "cn":
return rouge_cn_score(preds, targets)
elif language == "en":
return rouge_en_score(preds, targets)
def distinct_score(preds: List[str], language: str) -> Dict[str, float]:
"""Calculate Distinct Score Metric """Calculate Distinct Score Metric
This metric refers to https://arxiv.org/abs/1510.03055. This metric refers to https://arxiv.org/abs/1510.03055.
...@@ -72,19 +114,36 @@ def distinct_score(preds: list) -> dict: ...@@ -72,19 +114,36 @@ def distinct_score(preds: list) -> dict:
cumulative_distinct = [] cumulative_distinct = []
for pred in preds: for pred in preds:
pred_seg_list = list(' '.join(jieba.cut(pred))) if language == "cn":
pred_seg_list = ' '.join(jieba.cut(pred)).split()
count_segs = len(pred_seg_list) count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list) unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs) count_unique_chars = len(unique_segs)
cumulative_distinct.append(count_unique_chars / count_segs) cumulative_distinct.append(count_unique_chars / count_segs)
elif language == "en":
# calculate distinct 1-gram, 2-gram, 3-gram
unique_ngram = [set() for _ in range(0, 3)]
all_ngram_count = [0 for _ in range(0, 3)]
split_pred = preprocessing_text(pred).split()
for n in range(0, 3):
for i in range(0, len(split_pred) - n):
ngram = ' '.join(split_pred[i:i + n + 1])
unique_ngram[n].add(ngram)
all_ngram_count[n] += 1
# Sometimes the answer may contain only one word. For 2-gram and 3-gram, the gram count(denominator) may be zero.
avg_distinct = [len(a) / (b + 1e-6) for a, b in zip(unique_ngram, all_ngram_count)]
cumulative_distinct.append(statistics.mean(avg_distinct))
distinct_score["distinct"] = statistics.mean(cumulative_distinct) distinct_score["distinct"] = statistics.mean(cumulative_distinct)
return distinct_score return distinct_score
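For English, the new branch averages distinct-1/2/3, i.e. unique n-grams over total n-grams, with a small epsilon so one-word answers do not divide by zero. A self-contained sketch of that computation on a made-up sentence:

```python
# Sketch of the English distinct-1/2/3 computation used above (toy input).
import statistics

pred = "the cat sat on the mat"
tokens = pred.split()

ratios = []
for n in range(1, 4):
    ngrams = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    ratios.append(len(set(ngrams)) / (len(ngrams) + 1e-6))  # epsilon guards 1-word answers

print(round(statistics.mean(ratios), 3))
```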
- def bert_score(preds: list, targets: list) -> dict:
+ def bert_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate BERTScore Metric
The BERTScore evaluates the semantic similarity between
@@ -95,23 +154,25 @@ def bert_score(preds: list, targets: list) -> dict:
target_list = []
for pred, target in zip(preds, targets):
- pred_list.append(' '.join(jieba.cut(pred)))
- target_list.append(' '.join(jieba.cut(target)))
+ pred_list.append(pred)
+ target_list.append(target)
+ if language == "cn":
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
+ elif language == "en":
+     _, _, F = score(pred_list, target_list, lang="en", verbose=True)
bert_score["bert_score"] = F.mean().item()
return bert_score
- def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
+ def calculate_precision_recall_f1(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Precision, Recall and F1-Score Calculation
The calculation of precision, recall and f1-score is realized by counting
the number f overlaps between the preds and target. The comparison length
- limited by the shorter one of preds and targets. This design is mainly
- considered for classifiction and extraction categories.
+ limited by the shorter one of preds and targets.
"""
precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
precision_scores = []
@@ -119,8 +180,12 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
f1_scores = []
for pred, target in zip(preds, targets):
- pred_list = [char for char in pred]
- target_list = [char for char in target]
+ if language == "cn":
+     pred_list = [char for char in ' '.join(jieba.cut(preprocessing_text(pred))).split()]
+     target_list = [char for char in ' '.join(jieba.cut(preprocessing_text(target))).split()]
+ elif language == "en":
+     pred_list = [char for char in preprocessing_text(pred).split()]
+     target_list = [char for char in preprocessing_text(target).split()]
target_labels = [1] * min(len(target_list), len(pred_list))
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
@@ -136,34 +201,31 @@ def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
return precision_recall_f1
- def precision(preds: list, targets: list) -> dict:
+ def precision(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Precision Metric
- (design for classifiction and extraction categories)
Calculating precision by counting the number of overlaps between the preds and target.
"""
precision = {"precision": 0}
- precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"]
+ precision["precision"] = calculate_precision_recall_f1(preds, targets, language)["precision"]
return precision
- def recall(preds: list, targets: list) -> dict:
+ def recall(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate Recall Metric
- (design for classifiction and extraction categories)
Calculating recall by counting the number of overlaps between the preds and target.
"""
recall = {"recall": 0}
- recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"]
+ recall["recall"] = calculate_precision_recall_f1(preds, targets, language)["recall"]
return recall
- def F1_score(preds: list, targets: list) -> dict:
+ def F1_score(preds: List[str], targets: List[str], language: str) -> Dict[str, float]:
"""Calculate F1-score Metric
- (design for classifiction and extraction categories)
Calculating f1-score by counting the number of overlaps between the preds and target.
"""
f1 = {"f1_score": 0}
- f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"]
+ f1["f1_score"] = calculate_precision_recall_f1(preds, targets, language)["f1_score"]
return f1
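All metric entry points now take a `language` argument ("cn" or "en") and preprocess the text accordingly. A rough usage sketch, assuming it is run from the evaluation directory with the dependencies (nltk, rouge_score, jieba, bert_score, scikit-learn) installed; the prediction and reference strings are invented, and the heavier BERTScore call is left out:

```python
# Rough usage sketch of the language-aware metrics defined in this file.
# Assumes the evaluation package and its dependencies are installed and that
# this script runs next to metrics.py; the texts below are invented examples.
import metrics

preds = ["The quick brown fox jumps over the lazy dog."]
targets = ["A quick brown fox jumped over a lazy dog."]

print(metrics.bleu_score(preds=preds, targets=targets, language="en"))
print(metrics.rouge_score(preds=preds, targets=targets, language="en"))
print(metrics.distinct_score(preds=preds, language="en"))
print(metrics.precision(preds=preds, targets=targets, language="en"))
```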
{
"id": 1,
"system_prompt": "You are a helpful and precise assistant for checking the quality of the answer. You will be given two different answers to the same question",
"prompt_template": "[Question]\n{question}\n\n[The Start of AI Assistant 1's Answer]\n{answer_1}\n\n[The End of AI Assistant 1's Answer]\n\n[The Start of AI Assistant 2's Answer]\n{answer_2}\n\n[The End of AI Assistant 2's Answer]\n\n[Requirements]\n{prompt}\n\n",
"prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."
}
@@ -8,3 +8,5 @@ seaborn
pandas
matplotlib
numpy
zhon
rouge_score
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.