import argparse
import os

import openai
from colossal_eval.evaluate.evaluator import Evaluator
from colossal_eval.utils import jload


def main(args):
    """Run the ColossalEval evaluation pipeline.

    Two modes, selected by the number of models supplied:
      * two models  -> pairwise "battle" between the two answer files;
      * one model   -> GPT-based evaluation of the answers against the
        ground-truth targets in ``args.target_file``.

    Args:
        args: parsed command-line namespace (see the argparse setup in
            ``__main__``): config/prompt/answer/target file paths, model
            names, GPT model choice, and save path.

    Raises:
        Exception: when a required prompt file is missing for the chosen
            mode, or when ``--gpt_with_reference`` is combined with
            ``text-davinci-003``.
        ValueError: for an unsupported config language or an unsupported
            number of models/answer files.
    """
    assert len(args.answer_file_list) == len(
        args.model_name_list
    ), "The number of answer files and model names should be equal!"

    # load config
    config = jload(args.config_file)

    # Guard clause: only Chinese and English evaluation configs are supported.
    if config["language"] not in ["cn", "en"]:
        raise ValueError(f'Unsupported language {config["language"]}!')

    # get metric settings for all categories
    metrics_per_category = {}
    for category, metric_config in config["category"].items():
        # Copy the per-category {metric_type: metrics} mapping.
        metrics_per_category[category] = dict(metric_config)

    battle_prompt = jload(args.battle_prompt_file) if args.battle_prompt_file else None
    gpt_evaluation_prompt = jload(args.gpt_evaluation_prompt_file) if args.gpt_evaluation_prompt_file else None

    if len(args.model_name_list) == 2 and not battle_prompt:
        raise Exception("No prompt file for battle provided. Please specify the prompt file for battle!")

    if len(args.model_name_list) == 1 and not gpt_evaluation_prompt:
        raise Exception(
            "No prompt file for gpt evaluation provided. Please specify the prompt file for gpt evaluation!"
        )

    # Reference-aware evaluation requires a chat-style model.
    if args.gpt_model == "text-davinci-003" and args.gpt_with_reference:
        raise Exception(
            "GPT evaluation with reference is not supported for text-davinci-003. You should specify chat models such as gpt-3.5-turbo or gpt-4."
        )

    # initialize evaluator
    evaluator = Evaluator(
        metrics_per_category,
        battle_prompt,
        gpt_evaluation_prompt,
        args.gpt_model,
        config["language"],
        args.gpt_with_reference,
    )

    if len(args.model_name_list) == 2:
        # Battle mode: flatten each model's per-category answers into one list.
        answers_1 = jload(args.answer_file_list[0])
        answers_2 = jload(args.answer_file_list[1])

        answers1 = [item for value in answers_1.values() for item in value["data"]]
        answers2 = [item for value in answers_2.values() for item in value["data"]]

        assert len(answers1) == len(answers2), "The number of answers for two models should be equal!"

        evaluator.battle(answers1=answers1, answers2=answers2)
        evaluator.save(args.save_path, args.model_name_list)
    elif len(args.model_name_list) == 1:
        # Single-model mode: compare flattened predictions against the
        # flattened ground-truth references under targets["test"].
        targets = jload(args.target_file)
        answers = jload(args.answer_file_list[0])

        references = [item for value in targets["test"].values() for item in value["data"]]
        predictions = [item for value in answers.values() for item in value["data"]]

        assert len(references) == len(
            predictions
        ), "The number of target answers and model answers should be equal!"

        evaluator.evaluate(
            answers=predictions, targets=references, save_path=args.save_path, model_name=args.model_name_list[0]
        )
        evaluator.save(args.save_path, args.model_name_list)
    else:
        raise ValueError("Unsupported number of answer files and model names!")


98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
if __name__ == "__main__":
    # Command-line entry point: collect file paths, model names, and GPT
    # settings, wire up the OpenAI API key, then run the pipeline.
    parser = argparse.ArgumentParser(description="ColossalAI LLM evaluation pipeline.")
    parser.add_argument(
        "--config_file", type=str, default=None, required=True, help="path to the file of target results"
    )
    parser.add_argument("--battle_prompt_file", type=str, default=None, help="path to the prompt file for battle")
    parser.add_argument(
        "--gpt_evaluation_prompt_file", type=str, default=None, help="path to the prompt file for gpt evaluation"
    )
    parser.add_argument("--target_file", type=str, default=None, help="path to the target answer (ground truth) file")
    parser.add_argument(
        "--answer_file_list",
        type=str,
        nargs="+",
        default=[],
        required=True,
        help="path to the answer files of at most 2 models",
    )
    parser.add_argument(
        "--model_name_list", type=str, nargs="+", default=[], required=True, help="the names of at most 2 models"
    )
    parser.add_argument(
        "--gpt_model",
        default="gpt-3.5-turbo-16k",
        choices=["text-davinci-003", "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
        help="which GPT model to use for evaluation",
    )
    parser.add_argument(
        "--gpt_with_reference",
        default=False,
        action="store_true",
        help="whether to include reference answer in gpt evaluation",
    )
    parser.add_argument("--save_path", type=str, default="results", help="path to save evaluation results")
    parser.add_argument("--openai_key", type=str, default=None, required=True, help="Your openai key")
    args = parser.parse_args()

    # Export the key via the environment so any library reading
    # OPENAI_API_KEY sees it too, then configure the openai client.
    if args.openai_key is not None:
        os.environ["OPENAI_API_KEY"] = args.openai_key
    openai.api_key = os.getenv("OPENAI_API_KEY")

    main(args)