"examples/git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "d5c5bc219e22e0878a14208bc963b84d969e61f4"
Unverified commit 34966378 authored by Yuanchen, committed by GitHub

[evaluation] add automatic evaluation pipeline (#3821)



* add functions for gpt evaluation

* add automatic eval

Update eval.py

* using jload and modify the type of answers1 and answers2

* Update eval.py

Update eval.py

* Update evaluator.py

* support gpt evaluation

* update readme.md

update README.md

update README.md

modify readme.md

* add Chinese example for config, battle prompt and evaluation prompt file

* remove GPT-4 config

* remove sample folder

---------
Co-authored-by: Yuanchen Xu <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
parent 05b8a8de
{
"language": "cn",
"category": {
"brainstorming": {
"GPT-3.5": [
"language organization",
"relevance",
"creativity",
"practicality",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"chat": {
"GPT-3.5": [
"language organization",
"relevance",
"naturalness",
"engagingness",
"reasonableness"
],
"Metrics": [
"Distinct"
]
},
"classification": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"closed_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"extraction": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Precision",
"Recall",
"F1 score"
]
},
"generation": {
"GPT-3.5": [
"language organization",
"relevance",
"diversity"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"open_qa": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"Distinct"
]
},
"rewriting": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
},
"roleplay": {
"GPT-3.5": [
"language organization",
"relevance",
"fidelity",
"creativity"
],
"Metrics": [
"Distinct"
]
},
"summarization": {
"GPT-3.5": [
"language organization",
"relevance",
"correctness",
"conciseness"
],
"Metrics": [
"BLEU",
"ROUGE",
"BERTScore"
]
}
}
}
import argparse
import json
import os
import openai
from evaluator import Evaluator
from utils import jload
def main(args):
assert len(args.answer_file_list) == len(
args.model_name_list), "The number of answer files and model names should be equal!"
# load config
config = jload(args.config_file)
if config["language"] == "cn":
# get metric settings for all categories
metrics_per_category = {}
for category in config["category"].keys():
metrics_all = {}
for metric_type, metrics in config["category"][category].items():
metrics_all[metric_type] = metrics
metrics_per_category[category] = metrics_all
battle_prompt = None
if args.battle_prompt_file:
battle_prompt = jload(args.battle_prompt_file)
gpt_evaluation_prompt = None
if args.gpt_evaluation_prompt_file:
gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file)
if len(args.model_name_list) == 2 and not battle_prompt:
raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")
if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
raise Exception(
"No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!")
# initialize evaluator
evaluator = Evaluator(metrics_per_category, battle_prompt, gpt_evaluation_prompt)
if len(args.model_name_list) == 2:
answers1 = jload(args.answer_file_list[0])
answers2 = jload(args.answer_file_list[1])
assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"
evaluator.battle(answers1=answers1, answers2=answers2)
evaluator.save(args.save_path, args.model_name_list)
elif len(args.model_name_list) == 1:
targets = jload(args.target_file)
answers = jload(args.answer_file_list[0])
assert len(targets) == len(answers), "The number of target answers and model answers should be equal!"
evaluator.evaluate(answers=answers, targets=targets)
evaluator.save(args.save_path, args.model_name_list)
else:
raise ValueError("Unsupported number of answer files and model names!")
else:
raise ValueError(f'Unsupported language {config["language"]}!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ColossalAI LLM evaluation pipeline.')
parser.add_argument('--config_file',
type=str,
default=None,
required=True,
                        help='path to the config file')
parser.add_argument('--battle_prompt_file', type=str, default=None, help='path to the prompt file for battle')
parser.add_argument('--gpt_evaluation_prompt_file',
type=str,
default=None,
help='path to the prompt file for gpt evaluation')
parser.add_argument('--target_file', type=str, default=None, help='path to the target answer (ground truth) file')
parser.add_argument('--answer_file_list',
type=str,
nargs='+',
default=[],
required=True,
help='path to the answer files of at most 2 models')
parser.add_argument('--model_name_list',
type=str,
nargs='+',
default=[],
required=True,
help='the names of at most 2 models')
parser.add_argument('--save_path', type=str, default="results", help='path to save evaluation results')
parser.add_argument('--openai_key', type=str, default=None, required=True, help='Your openai key')
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
main(args)
python eval.py \
--config_file "path to the config file" \
--battle_prompt_file "path to the prompt file for battle" \
--gpt_evaluation_prompt_file "path to the prompt file for gpt evaluation" \
--target_file "path to the target answer file" \
--answer_file_list "path to the answer files of at most 2 models" \
--model_name_list "the names of at most 2 models" \
--save_path "path to save results" \
--openai_key "your openai key" \
# Adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py
# Copyright 2023 LM-SYS@FastChat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import time
import re
import concurrent.futures
import openai
import tqdm
import shortuuid
import logging
from utils import jload, jdump, get_json_list
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
MAX_API_RETRY = 3
def get_eval(sys_prompt, user_prompt: str, answer_id: int, max_tokens: int, model: str):
logging.basicConfig(level=logging.INFO)
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model=model,
messages=[{
'role': 'system',
'content': sys_prompt
}, {
'role': 'user',
'content': user_prompt,
}],
temperature=0.2,
max_tokens=max_tokens,
)
review = response['choices'][0]['message']['content']
return {"review": review, 'id': answer_id}
except Exception as e:
logger.error(e)
time.sleep(1)
logger.error(f' Review {answer_id} failed after {MAX_API_RETRY} retries.')
    return {'review': '', 'id': answer_id}
def parse_score(review):
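    # Heuristic parser for the score pair in a GPT-4 review: it tries, in order,
    # the formats "8 out of 10 ... 7 out of 10", "a score of 8 ... a score of 7",
    # "8/10 ... 7/10", and finally a first line such as "8 7" or "8, 7".
    # Exactly two matches yield [score_1, score_2]; anything else falls through to
    # [-1, -1], which marks the review as invalid downstream.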
try:
pattern = re.compile('([0-9]|10) out of 10')
sp = re.findall(pattern, review)
if len(re.findall(pattern, review)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile('a score of ([0-9]|10)')
sp = re.findall(pattern, review)
if len(re.findall(pattern, review)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile('([0-9]|10)/10')
sp = re.findall(pattern, review)
if len(re.findall(pattern, review)) == 2:
return [float(sp[0]), float(sp[1])]
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
raise Exception('Invalid score pair.')
except Exception as e:
return [-1, -1]
def gen_prompt(reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2):
reviewer_idx = 0
for idx, reviewer in enumerate(reviewer_jsons):
if reviewer['category'] == cat:
reviewer_idx = idx
break
prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
prompt_json = prompt_jsons[prompt_id-1]
assert prompt_json['prompt_id'] == prompt_id
sys_prompt = prompt_json['system_prompt']
prompt_template = prompt_json['prompt_template']
defaults = prompt_json['defaults']
prompt = prompt_template.format(
question=ques, answer_1=ans1, answer_2=ans2, **defaults)
return sys_prompt, prompt, reviewer_idx+1
def evaluate(args):
answer1_jsons = jload(args.answer_file_list[0])
answer2_jsons = jload(args.answer_file_list[1])
reviewer_jsons = get_json_list(args.reviewer_file)
prompt_jsons = get_json_list(args.prompt_file)
assert len(answer1_jsons) == len(answer2_jsons)
handles = []
review_jsons = []
total_len = len(answer1_jsons)
question_idx_list = list(range(total_len))
logger.info(
f' Total number of answers: {len(answer2_jsons)}.')
reviews = []
with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
futures = []
for i in question_idx_list:
assert answer1_jsons[i]['id'] == answer2_jsons[i]['id']
answer_id = answer1_jsons[i]['id']
ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \
" " + answer1_jsons[i]['input']
cat = answer1_jsons[i]['category']
ans1 = answer1_jsons[i]['output']
ans2 = answer2_jsons[i]['output']
sys_prompt, prompt, reviewer_id = gen_prompt(
reviewer_jsons, prompt_jsons, cat, ques, ans1, ans2)
review_id = shortuuid.uuid()
review_jsons.append({
'review_id': review_id,
'id': answer_id,
'reviewer_id': reviewer_id,
'metadata': {}
})
future = executor.submit(
get_eval, sys_prompt, prompt, answer_id, args.max_tokens, args.model)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
reviews.append(future.result())
reviews.sort(key=lambda x: x['id'])
review_jsons.sort(key=lambda x: x['id'])
ans1_score = 0
ans2_score = 0
better_count = 0
worse_count = 0
tie_count = 0
invalid_count = 0
better_file = []
worse_file = []
tie_file = []
invalid_file = []
output_review_file = []
for idx, review in enumerate(reviews):
scores = parse_score(review['review'])
review_jsons[idx]['review'] = review['review']
review_jsons[idx]['score'] = scores
if scores[0] == -1 and scores[1] == -1:
invalid_count += 1
invalid_file.append(review_jsons[idx])
logger.info(f' Invalid score pair: {review_jsons[idx]["id"]}.')
else:
if scores[0] > scores[1]:
worse_count += 1
worse_file.append(review_jsons[idx])
elif scores[0] < scores[1]:
better_count += 1
better_file.append(review_jsons[idx])
else:
tie_count += 1
tie_file.append(review_jsons[idx])
ans1_score += scores[0]
ans2_score += scores[1]
output_review_file.append(review_jsons[idx])
better_file.sort(key=lambda x: x['id'])
worse_file.sort(key=lambda x: x['id'])
tie_file.sort(key=lambda x: x['id'])
invalid_file.sort(key=lambda x: x['id'])
output_review_file.sort(key=lambda x: x['id'])
name1 = os.path.basename(args.answer_file_list[0]).split("_answers")[0]
name2 = os.path.basename(args.answer_file_list[1]).split("_answers")[0]
prefix = f"{name1}_vs_{name2}"
jdump(better_file, os.path.join(
args.output_folder, prefix, f"{prefix}_better.json"))
jdump(worse_file, os.path.join(
args.output_folder, prefix, f"{prefix}_worse.json"))
jdump(tie_file, os.path.join(
args.output_folder, prefix, f"{prefix}_tie.json"))
jdump(invalid_file, os.path.join(
args.output_folder, prefix, f"{prefix}_invalid.json"))
jdump(output_review_file, os.path.join(
args.output_folder, prefix, f"{prefix}_review.json"))
if os.path.exists(os.path.join(args.output_folder, "results.json")):
results = jload(os.path.join(args.output_folder, "results.json"))
else:
results = {}
    results[prefix] = {
        'model': [name1, name2],
        'better': better_count,
        'worse': worse_count,
        'tie': tie_count,
        'win_rate': better_count / (len(reviews) - invalid_count),
        'score': [ans1_score / (len(reviews) - invalid_count),
                  ans2_score / (len(reviews) - invalid_count)]
    }
jdump(results, os.path.join(args.output_folder, "results.json"))
logger.info(f' Total {invalid_count} invalid score pair(s).')
logger.info(f' Model {name2} has {better_count} better answer(s).')
logger.info(f' Model {name2} has {worse_count} worse answer(s).')
logger.info(f' {tie_count} answer(s) play(s) to a tie.')
logger.info(
f' Win rate of model {name2}: {better_count/(len(reviews)-invalid_count):.2f}')
logger.info(
f' Model {name1} average score: {ans1_score/(len(reviews)-invalid_count):.2f}')
logger.info(
f' Model {name2} average score: {ans2_score/(len(reviews)-invalid_count):.2f}')
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Model evaluation.')
parser.add_argument('--answer_file_list', nargs='+', default=[])
parser.add_argument('--prompt_file')
parser.add_argument('--reviewer_file')
parser.add_argument('--output_folder', type=str, default="./output")
parser.add_argument('--openai_key', type=str, default=None)
parser.add_argument('--model', type=str, default="gpt-4")
parser.add_argument('--num_workers', type=int, default=8)
parser.add_argument('--max_tokens', type=int, default=512,
help='maximum number of tokens produced in the output')
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
evaluate(args)
python evaluate.py \
--answer_file_list "path to answers of model 1" "path to answers of model 2" \
--prompt_file "path to prompt file" \
--reviewer_file "path to reviewer file" \
--output_folder "path to output folder" \
--openai_key "your openai key" \
--model "gpt-4" \
--num_workers 8 \
    --max_tokens 512
import os
from typing import Any, Dict, List
import gpt_evaluate
import metrics
import pandas as pd
from utils import get_data_per_category, jdump
class Evaluator(object):
"""
    Evaluator wraps GPT-3.5/GPT-4 evaluation
    and automatic metric-based evaluation.
"""
def __init__(self, params: Dict[str, Any], battle_prompt: Dict[str, Any], gpt_evaluation_prompt: Dict[str,
Any]) -> None:
self.params = params
self.battle_prompt = battle_prompt
self.gpt_evaluation_prompt = gpt_evaluation_prompt
self.automatic_metric_stats = dict()
self.gpt35_evaluation_results = dict()
self.battle_results = []
def battle(self, answers1: List[Dict], answers2: List[Dict]) -> None:
"""
Comparison between two models using GPT-4 as the reviewer.
"""
self.battle_results = gpt_evaluate.battle(answers1, answers2, self.battle_prompt)
def evaluate(self, answers: List[Dict], targets: List[Dict]) -> None:
"""
A comprehensive evaluation of the answers from the model.
The function evaluates the model's performance from different perspectives
using GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
The metrics will be decided by the config file.
"""
def switch(metric):
if metric == "BLEU":
return metrics.bleu_score(preds=predicts_list, targets=targets_list)
elif metric == "ROUGE":
return metrics.rouge_cn_score(preds=predicts_list, targets=targets_list)
elif (metric == "Distinct"):
return metrics.distinct_score(preds=predicts_list)
elif (metric == "BERTScore"):
return metrics.bert_score(preds=predicts_list, targets=targets_list)
elif (metric == "Precision"):
return metrics.precision(preds=predicts_list, targets=targets_list)
elif (metric == "Recall"):
return metrics.recall(preds=predicts_list, targets=targets_list)
elif (metric == "F1 score"):
return metrics.F1_score(preds=predicts_list, targets=targets_list)
else:
raise ValueError(f"Unexpected metric")
answers_per_category = get_data_per_category(answers, list(self.params.keys()))
targets_per_category = get_data_per_category(targets, list(self.params.keys()))
# automatic evaluation
for category in self.params:
category_metrics = self.params[category]["Metrics"]
self.automatic_metric_stats[category] = {}
targets_list = [
target["target"] if target["target"] else target["output"] for target in targets_per_category[category]
]
predicts_list = [answer["output"] for answer in answers_per_category[category]]
for metric in category_metrics:
self.automatic_metric_stats[category].update(switch(metric=metric))
# gpt35 evaluation
for category in self.params:
category_metrics = self.params[category]["GPT-3.5"]
prompt = self.gpt_evaluation_prompt.get(category, None)
if prompt is None:
print(f"No prompt for category {category}! Use prompt for category general now.")
prompt = self.gpt_evaluation_prompt["general"]
self.gpt35_evaluation_results[category] = gpt_evaluate.gpt35_evaluate(answers_per_category[category],
prompt, category_metrics, category)
def save(self, path: str, model_name_list: List[str]) -> None:
"""
Save evaluation results of GPT-3.5, GPT-4, and off-the-shelf evaluation metrics.
"""
if len(model_name_list) == 2:
save_path = os.path.join(path, "gpt_evaluate", "battle_results")
gpt_evaluate.save_battle_results(self.battle_results, model_name_list[0], model_name_list[1], save_path)
else:
# save evaluation results for automatic metrics
automatic_df = pd.DataFrame(self.automatic_metric_stats)
automatic_results_save_path = os.path.join(path, "automatic_results")
if not os.path.exists(automatic_results_save_path):
os.makedirs(automatic_results_save_path)
automatic_df.to_csv(os.path.join(automatic_results_save_path, f"{model_name_list[0]}.csv"), index=True)
# Save evaluation results for GPT-3.5 evaluation metrics.
all_evaluations = []
base_save_path = os.path.join(path, "gpt_evaluate", "gpt35_evaluate_results")
evaluation_results_save_path = os.path.join(base_save_path, "evaluation_results")
for category, evaluations in self.gpt35_evaluation_results.items():
jdump(
evaluations,
os.path.join(evaluation_results_save_path, model_name_list[0],
f"{category}_evaluation_results.json"))
all_evaluations.extend(evaluations)
jdump(all_evaluations,
os.path.join(evaluation_results_save_path, f"{model_name_list[0]}_evaluation_results.json"))
            # Calculate scores and save statistics.
evaluation_statistics_save_path = os.path.join(base_save_path, "evaluation_statistics")
gpt_evaluate.save_gpt35_evaluation_statistics(model_name_list[0], all_evaluations,
evaluation_statistics_save_path)
# Save charts and csv.
evaluation_analyses_save_path = os.path.join(base_save_path, "evaluation_analyses")
gpt_evaluate.analyze_gpt35_evaluation_statistics(evaluation_statistics_save_path,
evaluation_analyses_save_path)
import argparse
import os
import random
import copy
import math
from tqdm import tqdm
import torch
import torch.distributed as dist
import transformers
from coati.models.bloom import BLOOMActor
from coati.models.gpt import GPTActor
from coati.models.opt import OPTActor
from coati.models.roberta import RoBERTaActor
from coati.models.llama import LlamaActor
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
from transformers import AutoTokenizer, RobertaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.logging import get_dist_logger
from utils import jload, jdump, is_rank_0
logger = get_dist_logger()
PROMPT_DICT = {
"prompt_input":
("Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"),
"prompt_no_input": ("Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response:"),
}
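# For illustration, PROMPT_DICT["prompt_no_input"].format(instruction="List three primary colors.")
# renders to:
#   "Below is an instruction that describes a task. Write a response that appropriately
#    completes the request.\n\n### Instruction:\nList three primary colors.\n\n### Response:"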
def generate(args):
# torch.cuda.set_per_process_memory_fraction(0.4)
if args.strategy == 'naive':
strategy = NaiveStrategy()
elif args.strategy == 'ddp':
strategy = DDPStrategy()
elif args.strategy == 'colossalai_gemini':
strategy = ColossalAIStrategy(stage=3, placement_policy='cuda')
elif args.strategy == 'colossalai_zero2':
strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
elif args.strategy == 'colossalai_zero2_cpu':
strategy = ColossalAIStrategy(stage=2, placement_policy='cpu')
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
world_size = dist.get_world_size()
rank = dist.get_rank()
with strategy.model_init_context():
if args.model == 'gpt2':
actor = GPTActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'bloom':
actor = BLOOMActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'opt':
actor = OPTActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'roberta':
actor = RoBERTaActor(pretrained=args.model_path).to(
torch.cuda.current_device())
elif args.model == 'llama':
actor = LlamaActor(pretrained=args.model_path).to(
torch.float16).to(torch.cuda.current_device())
else:
raise ValueError(f'Unsupported model "{args.model}"')
if args.model == 'gpt2':
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'bloom':
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m')
tokenizer.pad_token = tokenizer.eos_token
elif args.model == 'opt':
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-350m')
elif args.model == 'roberta':
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
elif args.model == 'llama':
tokenizer = AutoTokenizer.from_pretrained(args.model_path,
padding_side="right",
use_fast=False,
)
        tokenizer.eos_token = '</s>'
else:
raise ValueError(f'Unsupported model "{args.model}"')
questions = []
if args.max_datasets_size is not None:
questions = random.sample(jload(args.dataset), args.max_datasets_size)
if is_rank_0():
logger.info(
f"Limiting dataset to {args.max_datasets_size} examples.")
questions = questions[rank:args.max_datasets_size:world_size]
answers = copy.deepcopy(questions)
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
sources = [
prompt_input.format_map(example) if example.get(
"input", "") != "" else prompt_no_input.format_map(example)
for example in questions
]
if is_rank_0():
logger.info("Tokenizing inputs... This may take some time...")
input_ids_list = []
for string in sources:
input_ids = tokenizer.encode(string, return_tensors='pt').squeeze(0)
input_ids_list.append(input_ids)
bar = tqdm(range(math.ceil(len(input_ids_list)/args.batch_size)),
desc=f'steps', disable=not is_rank_0())
actor.eval()
with torch.no_grad():
for i in range(0, len(input_ids_list), args.batch_size):
batch = input_ids_list[i:i+args.batch_size]
batch = [i.flip(dims=[0]) for i in batch]
batch = torch.nn.utils.rnn.pad_sequence(batch,
batch_first=True,
padding_value=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0).to(torch.cuda.current_device())
batch = batch.flip(dims=[1])
attention_mask = batch.ne(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0)
outputs = actor.model.generate(batch, attention_mask=attention_mask,
max_length=args.max_length,
do_sample=True,
top_k=50,
top_p=0.95,
num_return_sequences=1)
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for j in range(batch.size(0)):
                answers[i + j]['output'] = outputs[j].split("### Response:")[1].strip()
bar.update()
jdump(answers, os.path.join(args.answer_path,
f'{args.model_name}_answers_rank{rank}.json'))
if is_rank_0():
logger.info(
f'Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.3f} GB')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--strategy',
choices=['naive', 'ddp', 'colossalai_gemini',
'colossalai_zero2', 'colossalai_zero2_cpu'],
default='naive')
parser.add_argument('--model', default='gpt2',
choices=['gpt2', 'bloom', 'opt', 'roberta', 'llama'])
parser.add_argument('--model_path', type=str, default=None)
parser.add_argument('--model_name', type=str, default='model')
parser.add_argument('--dataset', type=str, default=None)
parser.add_argument('--batch_size', type=int, default=1)
parser.add_argument('--max_datasets_size', type=int, default=None)
parser.add_argument('--answer_path', type=str, default="answer")
parser.add_argument('--max_length', type=int, default=1024)
args = parser.parse_args()
generate(args)
device_number=number of your devices
model_name="name of your model"
model_path="path to your model"
dataset="path to the question dataset"
answer_path="path to save the model answers"
torchrun --standalone --nproc_per_node=$device_number generate_answers.py \
--model 'llama' \
--strategy ddp \
--model_path $model_path \
--model_name $model_name \
--dataset $dataset \
--batch_size 8 \
--max_datasets_size 80 \
--answer_path $answer_path \
--max_length 512
python merge.py \
--model_name $model_name \
--shards $device_number \
    --answer_path $answer_path
for (( i=0; i<device_number; i++ )) do
rm -rf "${answer_path}/${model_name}_answers_rank${i}.json"
done
# Adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/qa_baseline_gpt35.py
# Copyright 2023 LM-SYS@FastChat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import time
import concurrent.futures
import openai
import tqdm
import shortuuid
import logging
from utils import jload, jdump
MODEL = 'gpt-3.5-turbo'
MAX_API_RETRY = 3
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_answer(question: str, max_tokens: int):
answer = question
prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \
" " + question['input']
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[{
'role': 'system',
'content': 'You are a helpful assistant.'
}, {
'role': 'user',
'content': prompt,
}],
max_tokens=max_tokens,
)
answer['output'] = response['choices'][0]['message']['content']
return answer
except Exception as e:
logger.error(e)
time.sleep(1)
logger.error(f' Answer {question["id"]} failed after {MAX_API_RETRY} retries.')
return answer
def evaluate_gpt35(args):
questions=jload(args.dataset)
logger.info(
f' Total number of answers: {len(questions)}.')
logger.info(
f' Waiting for {args.request_time_gap} seconds before sending the next request.')
answers = []
with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_workers) as executor:
futures = []
for question in questions:
future = executor.submit(get_answer, question, args.max_tokens)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
answers.append(future.result())
answers.sort(key=lambda x: x['id'])
jdump(answers, os.path.join(args.answer_path,
f'gpt35_answers.json'))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Evaluate GPT 3.5.')
parser.add_argument('--dataset', type=str, default="questions.json")
parser.add_argument('--answer_path', type=str, default="answer")
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--openai_key', type=str, default=None)
parser.add_argument('--max_tokens', type=int, default=1024)
args = parser.parse_args()
if args.openai_key is not None:
os.environ["OPENAI_API_KEY"] = args.openai_key
openai.api_key = os.getenv("OPENAI_API_KEY")
evaluate_gpt35(args)
python generate_gpt35_answers.py \
--dataset "path to the question dataset" \
--answer_path "path to answer folder" \
--num_workers 4 \
--openai_key "your openai key" \
    --max_tokens 512
import concurrent.futures
import os
import re
import time
from copy import deepcopy
from typing import Any, Dict, List
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import tqdm
from utils import jdump, jload
def get_battle_result(sys_prompt: str, user_prompt: str, id: int, max_tokens: int = 2048) -> Dict[str, Any]:
"""
Get evaluation from GPT-4.
Args:
sys_prompt: prompt for the system.
user_prompt: prompt for the user.
id: id of the answers for comparison.
max_tokens: the maximum number of tokens to generate in the chat completion.
Returns:
An evaluation of one comparison.
"""
MAX_API_RETRY = 3
for _ in range(MAX_API_RETRY):
try:
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{
"role": "system",
"content": sys_prompt
},
{
"role": "user",
"content": user_prompt,
},
],
temperature=0.2,
max_tokens=max_tokens,
)
evaluation = response["choices"][0]["message"]["content"]
return {"evaluation": evaluation, "id": id}
except Exception as e:
print(e)
time.sleep(1)
print(f" Evaluation {id} failed after {MAX_API_RETRY} retries.")
return {"evaluation": "", "id": id}
def parse_battle_score(evaluation: str) -> List[float]:
"""
Parse evaluation from GPT-4 and get the scores of model 1 and 2.
Args:
evaluation: evaluation from GPT-4.
Returns:
A score pair of two different model answers.
"""
try:
pattern = re.compile("([0-9]|10) out of 10")
sp = re.findall(pattern, evaluation)
if len(re.findall(pattern, evaluation)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile("a score of ([0-9]|10)")
sp = re.findall(pattern, evaluation)
if len(re.findall(pattern, evaluation)) == 2:
return [float(sp[0]), float(sp[1])]
pattern = re.compile("([0-9]|10)/10")
sp = re.findall(pattern, evaluation)
if len(re.findall(pattern, evaluation)) == 2:
return [float(sp[0]), float(sp[1])]
score_pair = evaluation.split("\n")[0]
score_pair = score_pair.replace(",", " ")
sp = score_pair.split(" ")
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
raise Exception(f"Invalid score pair. Got {evaluation}.")
except Exception as e:
return [-1, -1]
def battle(answer1: List[Dict], answer2: List[Dict], prompt_dict: Dict[str, Any]) -> List[Dict]:
"""
Use GPT-4 to compare answers of two different models.
Args:
answer1: answers of model 1.
answer2: answers of model 2.
prompt_dict: prompt for battle.
Returns:
Evaluations of all comparison pairs.
"""
assert len(answer1) == len(answer2)
handles = []
evaluation_file = []
total_len = len(answer1)
question_idx_list = list(range(total_len))
print(f" Total number of answers: {len(answer1)}.")
evaluations = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for i in question_idx_list:
assert answer1[i]["id"] == answer2[i]["id"]
answer_id = answer1[i]["id"]
ques = answer1[i]["instruction"] if answer1[i][
"input"] == "" else answer1[i]["instruction"] + " " + answer1[i]["input"]
cat = answer1[i]["category"]
ans1 = answer1[i]["output"]
ans2 = answer2[i]["output"]
sys_prompt = prompt_dict["system_prompt"]
prompt_template = prompt_dict["prompt_template"]
prompt = prompt_template.format(
question=ques,
answer_1=ans1,
answer_2=ans2,
prompt=prompt_dict["prompt"],
)
future = executor.submit(get_battle_result, sys_prompt, prompt, answer_id, 2048)
futures.append(future)
for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
evaluations.append(future.result())
evaluations.sort(key=lambda x: x["id"])
return evaluations
def save_battle_results(evaluations: List[Dict], name1: str, name2: str, save_path: str) -> None:
"""
Save evaluation results (model 1 vs model 2) from GPT-4.
Args:
evaluations: evaluation results from GPT-4.
name1: model 1 's name.
name2: model 2 's name.
save_path: path to save battle results.
"""
evaluation_file = deepcopy(evaluations)
ans1_score = 0
ans2_score = 0
better_count = 0
worse_count = 0
tie_count = 0
invalid_count = 0
better_file = []
worse_file = []
tie_file = []
invalid_file = []
for idx, evaluation in enumerate(evaluations):
scores = parse_battle_score(evaluation["evaluation"])
evaluation_file[idx]["score"] = scores
if scores[0] == -1 and scores[1] == -1:
invalid_count += 1
invalid_file.append(evaluation_file[idx])
print(f'Invalid score pair: {evaluation_file[idx]["id"]}.')
else:
if scores[0] > scores[1]:
worse_count += 1
worse_file.append(evaluation_file[idx])
elif scores[0] < scores[1]:
better_count += 1
better_file.append(evaluation_file[idx])
else:
tie_count += 1
tie_file.append(evaluation_file[idx])
ans1_score += scores[0]
ans2_score += scores[1]
prefix = f"{name1}_vs_{name2}"
if not os.path.exists(save_path):
os.makedirs(save_path)
jdump(better_file, os.path.join(save_path, prefix, f"{name2}_better.json"))
jdump(worse_file, os.path.join(save_path, prefix, f"{name2}_worse.json"))
jdump(tie_file, os.path.join(save_path, prefix, f"{prefix}_tie.json"))
jdump(invalid_file, os.path.join(save_path, prefix, f"{prefix}_invalid.json"))
jdump(evaluation_file, os.path.join(save_path, prefix, f"{prefix}_evaluations.json"))
if os.path.exists(os.path.join(save_path, "battle_results.json")):
results = jload(os.path.join(save_path, "battle_results.json"))
else:
results = {}
results[prefix] = {
"model": [name1, name2],
"better": better_count,
"worse": worse_count,
"tie": tie_count,
"win_rate": better_count / (len(evaluations) - invalid_count),
"score": [
ans1_score / (len(evaluations) - invalid_count),
ans2_score / (len(evaluations) - invalid_count),
],
}
jdump(results, os.path.join(save_path, "battle_results.json"))
print(f"Total {invalid_count} invalid score pair(s).")
print(f"Model {name2} has {better_count} better answer(s).")
print(f"Model {name2} has {worse_count} worse answer(s).")
print(f"{tie_count} answer(s) play(s) to a tie.")
print(f"Win rate of model {name2}: {better_count/(len(evaluations)-invalid_count):.2f}")
print(f"Model {name1} average score: {ans1_score/(len(evaluations)-invalid_count):.2f}")
print(f"Model {name2} average score: {ans2_score/(len(evaluations)-invalid_count):.2f}")
def get_gpt35_evaluation(prompt: Dict[str, Any],
inst: Dict[str, Any],
metrics: List[str],
max_tokens: int = 2048) -> Dict[str, Any]:
"""
Use GPT-3.5 to evaluate one model answer.
Args:
prompt: a dictionary including prompt template, CoT and metrics.
inst: the instruction that is needed to be evaluated.
metrics: the metrics for evaluation.
max_tokens: the maximum number of tokens to generate in the completion.
Returns:
An evaluation of one answer.
"""
MAX_API_RETRY = 3
question = (inst["instruction"] if inst["input"] == "" else inst["instruction"] + " " + inst["input"])
answer = inst["output"]
inst["evaluation"] = {}
for metric in metrics:
if prompt["metrics"].get(metric, None) is None:
raise Exception(
f"Unsupported metric {metric} for category {inst['category']}! You should add this metric in the prompt file!"
)
for i in range(MAX_API_RETRY):
try:
response = openai.Completion.create(
model="text-davinci-003",
prompt=prompt["prompt"].format(
question=question,
answer=answer,
metric=prompt["metrics"][metric],
steps=prompt["CoT"][metric],
),
logprobs=5,
temperature=0,
max_tokens=max_tokens,
)
inst["evaluation"][metric] = {
"response": response["choices"][0]["text"],
"logprobs": response["choices"][0]["logprobs"]["top_logprobs"],
}
break
except Exception as e:
print(e)
time.sleep(1)
return inst
def gpt35_evaluate(
answers: List[Dict],
prompt: Dict[str, Any],
metrics: List[str],
category: str,
) -> List[Dict]:
"""
Use GPT-3.5 to evaluate model answers and save evaluation results.
Args:
answers: model answers.
prompt: prompt for GPT-3.5 evaluation.
metrics: metrics for GPT-3.5 evaluation.
category: the category of the model answers for evaluation.
Returns:
Evaluations of the given answers.
"""
print(f"The number of instances of category {category}'s is {len(answers)}.")
evaluations = []
metrics_str = ", ".join(x for x in metrics)
print(f"Category {category}'s metrics are {metrics_str}.")
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
futures = []
for inst in answers:
future = executor.submit(get_gpt35_evaluation, prompt, inst, metrics, 1)
futures.append(future)
for future in tqdm.tqdm(
concurrent.futures.as_completed(futures),
desc=f"{category}: ",
total=len(futures),
):
evaluations.append(future.result())
evaluations.sort(key=lambda x: x["id"])
print(f"{category} done.")
return evaluations
def calculate_scores_form_logprobs(logprobs: Dict[str, Any]) -> float:
"""
Calculate score from log probabilities returned by text-davinci-003.
Only openai.Completion can return logprobs.
Calculation formula:
score = sum(score_i * exp(value)) where score_i is the score which corresponds to the key(predicted token) and value is its log probability.
Ref: https://arxiv.org/abs/2303.16634
This paper proposes NLG evaluation methods using GPT-3.5(logprobs returned by openai api) and GPT-4(logprobs obtained by sampling).
Args:
logprobs: logprobs returned by openai.Completion.
Returns:
Score of one answer.
"""
# GPT-3.5 only returns score of 1 to 5.
prob = np.zeros(5)
for key, value in logprobs.items():
# Sometimes the key will be one byte of a unicode character which takes the form of "bytes:\\xe7".
# It is meaningless and thus we don't calculate probability.
if "bytes" in key:
continue
# results[0] is the score which corresponds to the key(predicted token).
# For example, key "5" corresponds to score 5.
results = re.findall(r"\d", key)
if len(results) == 1:
prob[int(results[0]) - 1] = prob[int(results[0]) - 1] + np.exp(value)
score = np.dot(np.arange(1, 6), prob)
return score
def save_gpt35_evaluation_statistics(model_name: str, evaluations: List[Dict], save_path: str) -> None:
"""
Generate statistics for one model.
Args:
model_name: name of the model for saving statistics.
evaluations: evaluations for all of the model answers.
save_path: path to save GPT-3.5 evaluation statistics.
"""
if not os.path.exists(save_path):
os.makedirs(save_path)
data_per_category = {}
for evaluation in evaluations:
category = evaluation["category"]
if evaluation["category"] in data_per_category.keys():
data_per_category[category].append(evaluation)
else:
data_per_category[category] = [evaluation]
all_statistics = {}
for category, data in data_per_category.items():
metrics = data[0]["evaluation"].keys()
scores = {metric: [] for metric in metrics}
for evaluation in data:
for metric in metrics:
scores[metric].append(calculate_scores_form_logprobs(evaluation["evaluation"][metric]["logprobs"][0]))
statistics = {}
for metric in metrics:
arg_sort = np.argsort(scores[metric])
statistics[metric] = {}
statistics[metric]["avg_score"] = sum(scores[metric]) / len(data)
statistics[metric]["best_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[-3:][::-1]}
statistics[metric]["worst_3"] = {data[i]["id"]: scores[metric][i] for i in arg_sort[:3]}
all_statistics[category] = statistics
jdump(
all_statistics,
os.path.join(save_path, f"{model_name}_evaluation_statistics.json"),
)
def analyze_gpt35_evaluation_statistics(statistics_path: str, save_path: str) -> None:
"""
Analyze and visualize all GPT-3.5 evaluation statistics in the given directory.
Args:
statistics_path: path to all the models' statistics.
save_path: path to save table and visualization results.
"""
if not os.path.exists(statistics_path):
raise Exception(f'The given directory "{statistics_path}" doesn\'t exist! No statistics found!')
all_statistics = {}
for file_name in os.listdir(statistics_path):
if file_name.endswith("_evaluation_statistics.json"):
model_name = file_name.split("_evaluation_statistics.json")[0]
all_statistics[model_name] = jload(os.path.join(statistics_path, file_name))
if len(list(all_statistics.keys())) == 0:
raise Exception(f'There are no statistics in the given directory "{statistics_path}"!')
frame_all = {
"model": [],
"category": [],
"metric": [],
"avg_score": [],
"best_3": [],
"worst_3": [],
}
frame_per_category = {}
for model_name, model_statistics in all_statistics.items():
for category, category_statistics in model_statistics.items():
if frame_per_category.get(category) is None:
frame_per_category[category] = {
"model": [],
"metric": [],
"avg_score": [],
"best_3": [],
"worst_3": [],
}
for metric, metric_statistics in category_statistics.items():
frame_all["model"].append(model_name)
frame_all["category"].append(category)
frame_all["metric"].append(metric)
frame_all["avg_score"].append(metric_statistics["avg_score"])
frame_all["best_3"].append(metric_statistics["best_3"])
frame_all["worst_3"].append(metric_statistics["worst_3"])
frame_per_category[category]["model"].append(model_name)
frame_per_category[category]["metric"].append(metric)
frame_per_category[category]["avg_score"].append(metric_statistics["avg_score"])
frame_per_category[category]["best_3"].append(metric_statistics["best_3"])
frame_per_category[category]["worst_3"].append(metric_statistics["worst_3"])
if not os.path.exists(save_path):
os.makedirs(save_path)
frame_all = pd.DataFrame(frame_all)
frame_all.to_csv(os.path.join(save_path, "gpt35_evaluation_statistics.csv"))
for category in tqdm.tqdm(
frame_per_category.keys(),
desc=f"category: ",
total=len(frame_per_category.keys()),
):
data = pd.DataFrame(frame_per_category[category])
sns.set()
fig = plt.figure(figsize=(16, 10))
plt.ylim((0, 5))
fig = sns.barplot(x="metric", y="avg_score", hue="model", data=data, dodge=True)
fig.set_title(f"Comparison between Different Models for Category {category.title()}")
plt.xlabel("Evaluation Metric")
plt.ylabel("Average Score")
figure = fig.get_figure()
figure.savefig(os.path.join(save_path, f"{category}.png"), dpi=400)
import argparse
import os
from utils import jload, jdump
def generate(args):
dataset = []
for i in range(args.shards):
shard = jload(os.path.join(args.answer_path,
f'{args.model_name}_answers_rank{i}.json'))
dataset.extend(shard)
dataset.sort(key=lambda x: x['id'])
jdump(dataset, os.path.join(args.answer_path,
f'{args.model_name}_answers.json'))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='model')
parser.add_argument('--shards', type=int, default=4)
parser.add_argument('--answer_path', type=str, default="answer")
args = parser.parse_args()
generate(args)
import statistics
import jieba
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge_chinese import Rouge as Rouge_cn
from sklearn.metrics import f1_score, precision_score, recall_score
def bleu_score(preds: list, targets: list) -> dict:
"""Calculate BLEU Score Metric
The calculation includes BLEU-1 for unigram, BLEU-2 for bigram,
    BLEU-3 for trigram and BLEU-4 for 4-gram. Unigram evaluates
    accuracy at the word level, while the other n-grams evaluate
    fluency at the sentence level.
"""
bleu_scores = {"bleu1": 0, "bleu2": 0, "bleu3": 0, "bleu4": 0}
cumulative_bleu = [0] * 4
weights = [(1. / 1., 0., 0., 0.), (1. / 2., 1. / 2., 0., 0.), (1. / 3., 1. / 3., 1. / 3., 0.),
(1. / 4., 1. / 4., 1. / 4., 1. / 4.)]
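    # Passing a list of weight tuples to sentence_bleu (supported in recent NLTK releases)
    # returns one score per tuple, i.e. BLEU-1..BLEU-4 for each prediction, which are
    # accumulated and then averaged over all prediction/target pairs below.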
for pred, target in zip(preds, targets):
pred_list = (' '.join(jieba.cut(pred))).split()
target_list = [(' '.join(jieba.cut(target))).split()]
bleu = sentence_bleu(target_list, pred_list, weights=weights)
cumulative_bleu = [a + b for a, b in zip(cumulative_bleu, bleu)]
for i in range(len(cumulative_bleu)):
bleu_scores[f"bleu{i+1}"] = cumulative_bleu[i] / len(preds)
return bleu_scores
def rouge_cn_score(preds: list, targets: list) -> dict:
"""Calculate Chinese ROUGE Score Metric
The calculation includes ROUGE-1 for unigram, ROUGE-2 for bigram
and ROUGE-L. ROUGE-N evaluates the number of matching n-grams between
    the preds and targets. ROUGE-L measures the longest common
    subsequence (LCS) between preds and targets.
"""
rouge_scores = {"rouge1": {}, "rouge2": {}, "rougeL": {}}
all_preds = []
all_targets = []
for pred, target in zip(preds, targets):
pred_list = ' '.join(jieba.cut(pred))
target_list = ' '.join(jieba.cut(target))
all_preds.append(pred_list)
all_targets.append(target_list)
rouge_cn = Rouge_cn()
rouge_avg = rouge_cn.get_scores(all_preds, all_targets, avg=True)
rouge_scores["rouge1"] = rouge_avg["rouge-1"]["f"]
rouge_scores["rouge2"] = rouge_avg["rouge-2"]["f"]
rouge_scores["rougeL"] = rouge_avg["rouge-l"]["f"]
return rouge_scores
def distinct_score(preds: list) -> dict:
"""Calculate Distinct Score Metric
This metric refers to https://arxiv.org/abs/1510.03055.
    It evaluates the diversity of the generated text by counting
    the unique n-grams.
"""
distinct_score = {"distinct": 0}
cumulative_distinct = []
for pred in preds:
pred_seg_list = list(' '.join(jieba.cut(pred)))
count_segs = len(pred_seg_list)
unique_segs = set(pred_seg_list)
count_unique_chars = len(unique_segs)
cumulative_distinct.append(count_unique_chars / count_segs)
distinct_score["distinct"] = statistics.mean(cumulative_distinct)
return distinct_score
def bert_score(preds: list, targets: list) -> dict:
"""Calculate BERTScore Metric
The BERTScore evaluates the semantic similarity between
tokens of preds and targets with BERT.
"""
bert_score = {"bert_score": 0}
pred_list = []
target_list = []
for pred, target in zip(preds, targets):
pred_list.append(' '.join(jieba.cut(pred)))
target_list.append(' '.join(jieba.cut(target)))
_, _, F = score(pred_list, target_list, lang="zh", verbose=True)
bert_score["bert_score"] = F.mean().item()
return bert_score
def calculate_precision_recall_f1(preds: list, targets: list) -> dict:
"""Precision, Recall and F1-Score Calculation
    The calculation of precision, recall and f1-score is realized by counting
    the number of overlaps between the preds and targets. The comparison length is
    limited by the shorter one of preds and targets. This design is mainly
    intended for the classification and extraction categories.
"""
precision_recall_f1 = {"precision": 0, "recall": 0, "f1_score": 0}
precision_scores = []
recall_scores = []
f1_scores = []
for pred, target in zip(preds, targets):
pred_list = [char for char in pred]
target_list = [char for char in target]
target_labels = [1] * min(len(target_list), len(pred_list))
pred_labels = [int(pred_list[i] == target_list[i]) for i in range(0, min(len(target_list), len(pred_list)))]
precision_scores.append(precision_score(target_labels, pred_labels, zero_division=0))
recall_scores.append(recall_score(target_labels, pred_labels, zero_division=0))
f1_scores.append(f1_score(target_labels, pred_labels, zero_division=0))
precision_recall_f1["precision"] = statistics.mean(precision_scores)
precision_recall_f1["recall"] = statistics.mean(recall_scores)
precision_recall_f1["f1_score"] = statistics.mean(f1_scores)
return precision_recall_f1
def precision(preds: list, targets: list) -> dict:
"""Calculate Precision Metric
    (designed for the classification and extraction categories)
    Calculating precision by counting the number of overlaps between the preds and targets.
"""
precision = {"precision": 0}
precision["precision"] = calculate_precision_recall_f1(preds, targets)["precision"]
return precision
def recall(preds: list, targets: list) -> dict:
"""Calculate Recall Metric
    (designed for the classification and extraction categories)
    Calculating recall by counting the number of overlaps between the preds and targets.
"""
recall = {"recall": 0}
recall["recall"] = calculate_precision_recall_f1(preds, targets)["recall"]
return recall
def F1_score(preds: list, targets: list) -> dict:
"""Calculate F1-score Metric
    (designed for the classification and extraction categories)
    Calculating f1-score by counting the number of overlaps between the preds and targets.
"""
f1 = {"f1_score": 0}
f1["f1_score"] = calculate_precision_recall_f1(preds, targets)["f1_score"]
return f1
{
"id": 1,
"system_prompt": "你是一个检查回答质量的好助手。",
"prompt_template": "[问题]\n{question}\n\n[1号AI助手的答案]\n{answer_1}\n\n[1号AI助手答案终止]\n\n[2号AI助手的答案]\n{answer_2}\n\n[2号AI助手答案终止]\n\n[要求]\n{prompt}\n\n",
"prompt": "我们需要你评价这两个AI助手回答的性能。\n请对他们的回答的有用性、相关性、准确性、详细程度进行评分。每个AI助手都会得到一个1到10分的总分,分数越高表示整体表现越好。\n请首先输出一行,该行只包含两个数值,分别表示1号和2号AI助手的分数。这两个分数之间要有一个空格。在随后的一行中,请对你的评价作出全面的解释,避免任何潜在的偏见,并确保AI助手回答的顺序不会影响您的判断。"
}
jieba
bert-score
rouge_chinese
scikit-learn
nltk
openai
seaborn
pandas
matplotlib
numpy
[
{
"id": 0,
"instruction": "Help me summarize the following news?",
"input": "National Commercial Bank (NCB), Saudi Arabia's largest lender by assets, agreed to buy rival Samba Financial Group for $15 billion in the biggest banking takeover this year.NCB will pay 28.45 riyals ($7.58) for each Samba share, according to a statement on Sunday, valuing it at about 55.7 billion riyals. NCB will offer 0.739 new shares for each Samba share, at the lower end of the 0.736-0.787 ratio the banks set when they signed an initial framework agreement in June.The offer is a 3.5% premium to Samba's Oct. 8 closing price of 27.50 riyals and about 24% higher than the level the shares traded at before the talks were made public. Bloomberg News first reported the merger discussions.The new bank will have total assets of more than $220 billion, creating the Gulf region's third-largest lender. The entity's $46 billion market capitalization nearly matches that of Qatar National Bank QPSC, which is still the Middle East's biggest lender with about $268 billion of assets.",
"output": "NCB to pay 28.45 riyals for each Samba share. Deal will create Gulf region's third-largest lender",
"category": "closed qa"
}
]
@@ -2,10 +2,6 @@ import io
 import json
 import os
-import torch.distributed as dist
-def is_rank_0() -> bool:
-    return not dist.is_initialized() or dist.get_rank() == 0
 def _make_w_io_base(f, mode: str):
     if not isinstance(f, io.IOBase):
@@ -15,11 +11,13 @@ def _make_w_io_base(f, mode: str):
         f = open(f, mode=mode)
     return f
 def _make_r_io_base(f, mode: str):
     if not isinstance(f, io.IOBase):
         f = open(f, mode=mode)
     return f
 def jdump(obj, f, mode="w", indent=4, default=str):
     """Dump a str or dictionary to a file in json format.
     Args:
@@ -38,6 +36,7 @@ def jdump(obj, f, mode="w", indent=4, default=str):
         raise ValueError(f"Unexpected type: {type(obj)}")
     f.close()
 def jload(f, mode="r"):
     """Load a .json file into a dictionary."""
     f = _make_r_io_base(f, mode)
@@ -45,9 +44,19 @@ def jload(f, mode="r"):
     f.close()
     return jdict
 def get_json_list(file_path):
     with open(file_path, 'r') as f:
         json_list = []
         for line in f:
             json_list.append(json.loads(line))
         return json_list
+def get_data_per_category(data, categories):
+    data_per_category = {category: [] for category in categories}
+    for item in data:
+        category = item["category"]
+        data_per_category[category].append(item)
+    return data_per_category