# test_perf.py
# Concurrent streaming benchmark for an OpenAI-compatible chat-completions endpoint.
import asyncio
import time
from openai import AsyncOpenAI
import argparse
import random

# Pool of user prompts; each benchmark request picks one at random.
PROMPTS = [
    "如果猫能写诗,它们会写些什么?",
    "描述一个没有重力的世界。",
    "如果地球停止自转,会发生什么?",
    "假设你是一只会飞的鲸鱼,描述你的日常生活。",
    "如果人类可以与植物沟通,世界会变成什么样?",
    "描述一个由糖果构成的城市。",
    "如果时间旅行成为可能,你最想去哪个时代?",
    "想象一下,如果地球上只有蓝色,其他颜色都消失了。",
    "如果动物能上网,它们会浏览什么网站?",
    "描述一个没有声音的世界。",
    "如果人类可以在水下呼吸,城市会如何变化?",
    "想象一下,如果天空是绿色的,云是紫色的。",
    "如果你能与任何历史人物共进晚餐,你会选择谁?",
    "描述一个没有夜晚的星球。",
    "如果地球上只有一种语言,世界会如何运作?",
    "想象一下,如果所有的书都变成了音乐。",
    "如果你可以变成任何一种动物,你会选择什么?",
    "描述一个由机器人统治的未来世界。",
    "如果你能与任何虚构角色成为朋友,你会选择谁?",
    "想象一下,如果每个人都能读懂他人的思想。",
]

# Total number of chat-completion requests issued during one benchmark run.
NUM_REQUESTS = 64
# Number of simulated users (worker tasks) and size of the shared semaphore.
CONCURRENCY = 20
# Base URL handed to the AsyncOpenAI client.
API_URL = "http://127.0.0.1:8000"
# Model name sent with every request and shown in the summary header.
MODEL = "FM9G-7B"


async def benchmark_user(client, semaphore, queue, results, user_id, verbose):
    """Simulate one user: pull request IDs off ``queue`` and stream a completion for each.

    The worker loops until it dequeues a ``None`` sentinel. For every real
    task it sends a random prompt from ``PROMPTS`` as a streaming chat
    completion, times the stream, and appends one tuple to ``results``.

    Args:
        client: Object exposing ``chat.completions.create(...)`` that returns
            an async-iterable stream of chunks (run_benchmark passes an
            ``AsyncOpenAI`` instance).
        semaphore: Async context manager held for the duration of each
            dequeue-and-request cycle.
        queue: ``asyncio.Queue`` of integer task IDs followed by ``None``
            sentinels; ``task_done()`` is called once per dequeued item.
        results: List that receives, per successful request, the tuple
            ``(total_tokens, elapsed_time, tokens_per_second, ttft, ms_per_token)``.
        user_id: Identifier used only in log output.
        verbose: When true, print per-request metrics, the prompt, the
            answer, and any request failure.
    """
    while True:
        async with semaphore:
            task_id = await queue.get()
            if task_id is None:
                # Sentinel: acknowledge it so queue.join() can finish, then stop.
                queue.task_done()
                break

            question = random.choice(PROMPTS)
            try:
                print(f"🚀 User#{user_id} Sending request #{task_id}")

                # perf_counter is monotonic, so intervals cannot be skewed
                # by wall-clock adjustments during the run.
                start_time = time.perf_counter()
                stream = await client.chat.completions.create(
                    model=MODEL,
                    messages=[{"role": "user", "content": question}],
                    stream=True,
                )

                first_token_time = None
                total_tokens = 0
                answer_chunks = []

                async for chunk in stream:
                    # TTFT is taken at the first chunk received, whether or
                    # not that chunk carries content.
                    if first_token_time is None:
                        first_token_time = time.perf_counter()
                    # Some chunks carry no choices at all; indexing [0] on
                    # them would raise and discard the whole request.
                    if not chunk.choices:
                        continue
                    choice = chunk.choices[0]
                    delta = choice.delta.content
                    if delta:
                        answer_chunks.append(delta)
                        # Each content-bearing chunk is counted as one token.
                        total_tokens += 1
                    if choice.finish_reason is not None:
                        break

                end_time = time.perf_counter()

                ttft = (
                    first_token_time - start_time
                    if first_token_time is not None
                    else None
                )
                elapsed_time = end_time - start_time
                ms_per_token = (
                    (elapsed_time / total_tokens * 1000)
                    if total_tokens > 0 and elapsed_time > 0
                    else None
                )
                tokens_per_second = (
                    total_tokens / elapsed_time if elapsed_time > 0 else 0
                )

                answer = "".join(answer_chunks)

                results.append(
                    (total_tokens, elapsed_time, tokens_per_second, ttft, ms_per_token)
                )

                if verbose:
                    print(f"\n📝 Request #{task_id} (User #{user_id})")
                    if ttft is not None:
                        print(f"  ⏱ 首字延迟 TTFT: {ttft:.3f}s")
                    print(f"  ⏱ 总耗时: {elapsed_time:.3f}s")

                    print(f"  🔤 解码 token 总数: {total_tokens}")
                    if ms_per_token is not None:
                        print(f"  📏 平均 token 解码时间: {ms_per_token:.2f} ms/token")
                    else:
                        print(f"  📏 平均 token 解码时间: N/A (no token generated)")
                    print(f"  ❓ 提问: {question}")
                    print(f"  💬 回答: {answer}\n")
            except Exception as e:
                # A failed request adds nothing to results; it shows up as the
                # gap between total and successful requests in the summary.
                if verbose:
                    print(f"\n⚠️ Request #{task_id} (User #{user_id}) FAILED:")
                    print(f"  ❌ Error: {e}\n")
            finally:
                # Always acknowledge the task, success or failure, so that
                # queue.join() cannot hang.
                queue.task_done()


async def run_benchmark(verbose=False):
    """Run the whole benchmark and print a summary of the aggregate metrics.

    Enqueues ``NUM_REQUESTS`` task IDs followed by one ``None`` sentinel per
    worker, starts ``CONCURRENCY`` ``benchmark_user`` tasks against
    ``API_URL``, waits for all of them, and prints totals and averages.

    Args:
        verbose: Forwarded to every ``benchmark_user`` worker to enable
            per-request output.
    """

    def mean(values, default=0):
        # Average of ``values``; ``default`` when there is nothing to average.
        return sum(values) / len(values) if values else default

    client = AsyncOpenAI(base_url=API_URL, api_key="default")
    semaphore = asyncio.Semaphore(CONCURRENCY)
    queue = asyncio.Queue()
    results = []
    for i in range(NUM_REQUESTS):
        await queue.put(i)
    # One sentinel per worker so every worker sees a stop signal.
    for _ in range(CONCURRENCY):
        await queue.put(None)

    users = [
        asyncio.create_task(
            benchmark_user(client, semaphore, queue, results, user_id, verbose)
        )
        for user_id in range(CONCURRENCY)
    ]

    # Monotonic clock: the measured interval is immune to wall-clock changes.
    start_time = time.perf_counter()
    await queue.join()
    await asyncio.gather(*users)
    end_time = time.perf_counter()

    total_elapsed_time = end_time - start_time
    # Each result is (tokens, latency, tokens/s, ttft, ms/token); the last two
    # may be None, so every column is filtered before averaging.
    tokens_list = [r[0] for r in results if r and r[0] is not None]
    latencies = [r[1] for r in results if r and r[1] is not None]
    tokens_per_second_list = [r[2] for r in results if r and r[2] is not None]
    ttft_list = [r[3] for r in results if r and r[3] is not None]
    ms_per_token_list = [r[4] for r in results if r and r[4] is not None]

    # Only successful requests append to results.
    successful_requests = len(results)
    requests_per_second = (
        successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
    )
    avg_latency = mean(latencies)
    avg_tokens_per_second = mean(tokens_per_second_list)
    avg_ttft = mean(ttft_list)
    avg_ms_per_token = mean(ms_per_token_list, default=None)
    # None cannot be formatted with ':.2f'; this happens when no request
    # produced a token (for example, every request failed).
    avg_ms_per_token_text = (
        f"{avg_ms_per_token:.2f} ms/token"
        if avg_ms_per_token is not None
        else "N/A (no token generated)"
    )

    width_label = 24
    sep = "-" * 60

    print(f"\n=== 📊 性能指标汇总 ({MODEL}) ===")
    print(sep)
    print(f"{'并发数':<{width_label}}: {CONCURRENCY}")
    print(f"{'请求总数':<{width_label}}: {NUM_REQUESTS}")
    print(f"{'成功请求数':<{width_label}}: {successful_requests}")
    print(f"{'总耗时':<{width_label}}: {total_elapsed_time:.2f} s")
    print(f"{'总输出token数':<{width_label}}: {sum(tokens_list)}")
    print(f"{'请求速率 (RPS)':<{width_label}}: {requests_per_second:.2f} requests/s")
    print(sep)
    print(f"{'Average latency':<{width_label}}: {avg_latency:.2f} s")
    print(f"{'Average TTFT':<{width_label}}: {avg_ttft:.2f} s")
    print(f"{'Avg time per token':<{width_label}}: {avg_ms_per_token_text}")
    print(
        f"{'Avg Token generation speed':<{width_label}}: {avg_tokens_per_second:.2f} tokens/s"
    )


if __name__ == "__main__":
    # Script entry point: parse CLI flags, then run the benchmark to completion.
    parser = argparse.ArgumentParser(
        description="Concurrent streaming benchmark for a chat-completions endpoint."
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="print per-request metrics, the prompt, the answer, and failures",
    )
    args = parser.parse_args()

    asyncio.run(run_benchmark(args.verbose))