bench_other.py 6.71 KB
Newer Older
Lianmin Zheng's avatar
Lianmin Zheng committed
1
2
3
4
5
import argparse
import asyncio
import json
import os
import time
Liangsheng Yin's avatar
Liangsheng Yin committed
6
7
from concurrent.futures import ThreadPoolExecutor
from functools import partial
Lianmin Zheng's avatar
Lianmin Zheng committed
8
9
10
11
12
13

import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm

Liangsheng Yin's avatar
Liangsheng Yin committed
14
15
16
17
18
19
from sglang.test.test_utils import (
    add_common_other_args_and_parse,
    call_generate_lightllm,
    call_generate_srt_raw,
    call_generate_vllm,
)
Lianmin Zheng's avatar
Lianmin Zheng committed
20
21
22
23
24
25
26
27
28
29
30
31
32

choices = ["A", "B", "C", "D"]

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")


def format_subject(subject):
    """Convert an underscore-separated subject name into readable text.

    E.g. "abstract_algebra" -> " abstract algebra".  The leading space is
    intentional: gen_prompt() splices the result directly after
    "...questions (with answers) about".

    Replaces the original quadratic `+=` loop (with a throwaway variable
    shadowing-prone name `l`) by an equivalent single join.
    """
    return "".join(" " + part for part in subject.split("_"))

Liangsheng Yin's avatar
Liangsheng Yin committed
33

Lianmin Zheng's avatar
Lianmin Zheng committed
34
35
36
37
def format_example(df, idx, include_answer=True):
    """Render row `idx` of `df` as one MMLU question block.

    Column 0 holds the question text, the following columns hold the answer
    options (labeled with letters from the module-level `choices`), and the
    last column holds the gold answer letter, appended only when
    `include_answer` is True.
    """
    num_options = df.shape[1] - 2
    parts = [df.iloc[idx, 0]]
    for j in range(num_options):
        parts.append("\n{}. {}".format(choices[j], df.iloc[idx, j + 1]))
    parts.append("\nAnswer:")
    if include_answer:
        parts.append(" {}\n\n".format(df.iloc[idx, num_options + 1]))
    return "".join(parts)

Liangsheng Yin's avatar
Liangsheng Yin committed
44

Lianmin Zheng's avatar
Lianmin Zheng committed
45
def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot prompt: a subject header plus `k` worked examples
    drawn from the top of `train_df`.

    A `k` of -1 means "use every row of train_df".
    """
    num_shots = train_df.shape[0] if k == -1 else k
    header = "The following are multiple choice questions (with answers) about{}.\n\n".format(
        format_subject(subject)
    )
    examples = [format_example(train_df, i) for i in range(num_shots)]
    return header + "".join(examples)


# Module-level cache for the lazily-loaded guidance LlamaCpp model, so that
# repeated evaluate() calls (one per subject) load the weights only once.
model_initialized = None


def evaluate(args, subject, dev_df, test_df):
    """Benchmark one MMLU subject against the backend selected by args.backend.

    Builds a shared few-shot prefix from dev_df, sends one single-token
    completion request per test_df row, and scores the first non-whitespace
    character of each completion against the gold answer letter.

    Returns:
        cors: np.ndarray of per-question booleans (prediction == label).
        acc: mean accuracy over the subject.
        latency: wall-clock seconds spent issuing all requests.
    """
    prompts = []
    labels = []

    # Construct prompts
    k = args.ntrain
    train_prompt = gen_prompt(dev_df, subject, k)
    # Drop few-shot examples one at a time until the shared prefix fits in
    # 1536 tokens, measured with the tiktoken gpt-3.5-turbo encoding (a
    # length proxy only; the serving model may tokenize differently).
    while len(tokenizer.encode(train_prompt)) > 1536:
        k -= 1
        train_prompt = gen_prompt(dev_df, subject, k)

    for i in range(test_df.shape[0]):
        prompt_end = format_example(test_df, i, include_answer=False)
        prompt = train_prompt + prompt_end
        prompts.append(prompt)

        # The gold answer letter lives in the last CSV column.
        label = test_df.iloc[i, test_df.shape[1] - 1]
        labels.append(label)

    preds = [None] * len(prompts)
    # One generated token suffices: we only want the answer letter.
    max_tokens = 1

    # Select backend
    global model_initialized

    if args.backend == "lightllm":
        url = f"{args.host}:{args.port}/generate"
        call_generate = partial(call_generate_lightllm, url=url, stop=None)
    elif args.backend == "vllm":
        url = f"{args.host}:{args.port}/generate"
        call_generate = partial(call_generate_vllm, url=url, stop=None)
    elif args.backend == "srt-raw":
        url = f"{args.host}:{args.port}/generate"
        call_generate = partial(call_generate_srt_raw, url=url, stop=None)
    elif args.backend == "guidance":
        # Imported lazily so guidance is only required for this backend.
        from guidance import gen, models

        if model_initialized is None:
            # First call: load the model and cache it at module level so the
            # expensive load happens once across all subjects.
            model = models.LlamaCpp(
                "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
                n_gpu_layers=-1,
                n_ctx=4096,
            )
            model_initialized = model
        else:
            model = model_initialized

        def call_generate(prompt, temperature, max_tokens):
            # NOTE(review): the `temperature` argument is ignored here —
            # gen() is hard-coded to temperature=0 (greedy decoding).
            out = (
                model
                + prompt
                + gen(name="answer", max_tokens=max_tokens, temperature=0)
            )
            return out["answer"]

        # warmup
        call_generate("Hello,", temperature=1.0, max_tokens=8)

    elif args.backend == "lmql":
        import lmql

        model = lmql.model(
            "meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
        )

        # The triple-quoted string below is the LMQL program itself, not a
        # docstring: it generates at most one token into ANSWER after the
        # question text.
        @lmql.query(model=model)
        async def program(question):
            '''lmql
            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 2
            return ANSWER
            '''

        async def call_generate(prompt, temperature, max_tokens):
            # NOTE(review): max_tokens is unused — the LMQL constraint above
            # already limits ANSWER to a single token.
            return await program(question=prompt, temperature=temperature)

    else:
        raise ValueError(f"Invalid backend: {args.backend}")

    # Run requests
    if args.backend != "lmql":
        # Use thread pool
        def get_one_answer(i):
            pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
            # Keep only the first non-whitespace character (the answer letter).
            preds[i] = pred.strip()[0]

        tic = time.time()
        if args.parallel == 1:
            for i in range(len(prompts)):
                get_one_answer(i)
        else:
            with ThreadPoolExecutor(args.parallel) as executor:
                executor.map(get_one_answer, list(range(len(prompts))))
    else:
        # Use asyncio
        async def batched_call(batch_size):
            # Issue requests in batches of `batch_size` concurrent coroutines.
            for i in range(0, len(prompts), batch_size):
                tasks = []
                for p in prompts[i : i + batch_size]:
                    tasks.append(call_generate(p, temperature=0, max_tokens=max_tokens))
                rets = await asyncio.gather(*tasks)
                for j in range(len(rets)):
                    preds[i + j] = rets[j].strip()[0]

        tic = time.time()
        asyncio.run(batched_call(batch_size=args.parallel))
    latency = time.time() - tic

    # Compute accuracy
    cors = [pred == label for pred, label in zip(preds, labels)]
    acc = np.mean(cors)
    cors = np.array(cors)

    print(
        "Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
            acc, latency, len(prompts), subject
        )
    )

    return cors, acc, latency


def main(args):
    """Run the MMLU benchmark over the first `args.nsub` subjects and append
    one JSON summary line to `args.result_file`.
    """
    test_dir = os.path.join(args.data_dir, "test")
    subjects = sorted(
        name.split("_test.csv")[0]
        for name in os.listdir(test_dir)
        if "_test.csv" in name
    )

    per_subject_cors = []
    per_subject_latencies = []
    num_requests = 0

    for subject in tqdm(subjects[: args.nsub]):
        # Few-shot examples come from the dev split, capped at args.ntrain rows.
        dev_df = pd.read_csv(
            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
        )[: args.ntrain]
        test_df = pd.read_csv(
            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
        )

        cors, acc, latency = evaluate(args, subject, dev_df, test_df)
        per_subject_cors.append(cors)
        per_subject_latencies.append(latency)
        num_requests += len(test_df)

    total_latency = np.sum(per_subject_latencies)
    print("Total latency: {:.3f}".format(total_latency))

    weighted_acc = np.mean(np.concatenate(per_subject_cors))
    print("Average accuracy: {:.3f}".format(weighted_acc))

    # Write results
    record = {
        "task": "mmlu",
        "backend": args.backend,
        "num_gpus": 1,
        "latency": round(total_latency, 3),
        "accuracy": round(weighted_acc, 3),
        "num_requests": num_requests,
        "other": {
            "nsub": args.nsub,
            "parallel": args.parallel,
        },
    }
    # Append mode so repeated invocations accumulate one record per run.
    with open(args.result_file, "a") as fout:
        fout.write(json.dumps(record) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ntrain", type=int, default=5)
    parser.add_argument("--data_dir", type=str, default="data")
    parser.add_argument("--nsub", type=int, default=60)
    args = add_common_other_args_and_parse(parser)
    main(args)