Commit d39f1966 authored by Catheriany's avatar Catheriany
Browse files

issue/16: 添加verbose mode

parent fcb8ebdf
...@@ -28,66 +28,91 @@ PROMPTS = [ ...@@ -28,66 +28,91 @@ PROMPTS = [
"想象一下,如果每个人都能读懂他人的思想。" "想象一下,如果每个人都能读懂他人的思想。"
] ]
# Benchmark configuration (edit these to change the run shape).
NUM_REQUESTS = 10  # total chat requests enqueued per benchmark run
CONCURRENCY = 5  # number of concurrent simulated users (worker tasks)
API_URL = "http://127.0.0.1:8000"  # base URL of the OpenAI-compatible server
MODEL = "FM9G-7B"  # model name sent with every completion request
async def benchmark_user(client, semaphore, queue, results, user_id, verbose):
    """Worker coroutine simulating one concurrent user.

    Pulls task ids off ``queue`` until a ``None`` sentinel arrives. For each
    task it sends one streaming chat completion for a random prompt and
    appends a metrics tuple to ``results``:
    ``(total_tokens, elapsed_time, tokens_per_second, ttft, ms_per_token)``.

    Args:
        client: AsyncOpenAI client used to issue the streaming request.
        semaphore: limits how many workers hold a request slot at once.
        queue: asyncio.Queue of task ids, terminated by None sentinels.
        results: shared list collecting per-request metric tuples.
        user_id: index of this simulated user (for log messages only).
        verbose: when True, print per-request metrics and the Q/A text.
    """
    while True:
        async with semaphore:
            task_id = await queue.get()
            if task_id is None:
                # Sentinel: acknowledge it and shut this worker down.
                queue.task_done()
                break
            question = random.choice(PROMPTS)
            try:
                print(f"🚀 User#{user_id} Sending request #{task_id}")

                start_time = time.time()
                stream = await client.chat.completions.create(
                    model=MODEL,
                    messages=[{"role": "user", "content": question}],
                    stream=True,
                )

                first_token_time = None
                total_tokens = 0
                answer_chunks = []

                async for chunk in stream:
                    if first_token_time is None:
                        first_token_time = time.time()
                    delta = chunk.choices[0].delta.content
                    if delta:
                        answer_chunks.append(delta)
                        total_tokens += 1
                    if chunk.choices[0].finish_reason is not None:
                        break
                end_time = time.time()

                # Any of these can be None when the stream yielded no tokens.
                ttft = first_token_time - start_time if first_token_time else None
                elapsed_time = end_time - start_time if start_time else None
                ms_per_token = (elapsed_time / total_tokens * 1000) if total_tokens > 0 and elapsed_time else None
                tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0

                answer = "".join(answer_chunks)
                results.append((total_tokens, elapsed_time, tokens_per_second, ttft, ms_per_token))

                if verbose:
                    print(f"\n📝 Request #{task_id} (User #{user_id})")
                    # BUG FIX: guard the float format specs — ttft and
                    # ms_per_token are None when no token was streamed, and
                    # formatting None with :.3f would crash the worker.
                    if ttft is not None:
                        print(f"  ⏱ 首字延迟 TTFT: {ttft:.3f}s")
                    if elapsed_time is not None:
                        print(f"  ⏱ 总耗时: {elapsed_time:.3f}s")
                    print(f"  🔤 解码 token 总数: {total_tokens}")
                    if ms_per_token is not None:
                        print(f"  📏 平均 token 解码时间: {ms_per_token:.2f} ms/token")
                    print(f"  ❓ 提问: {question}")
                    print(f"  💬 回答: {answer}\n")
            except Exception as e:
                # Report failures even without verbose: silently dropping a
                # request makes the summary numbers misleading.
                print(f"\n⚠️ Request #{task_id} (User #{user_id}) FAILED:")
                print(f"  ❌ Error: {e}\n")
            finally:
                # BUG FIX: the original called task_done() only on the success
                # path, so a single failed request left queue.join() in
                # run_benchmark() blocked forever. finally guarantees every
                # dequeued task is acknowledged exactly once.
                queue.task_done()
async def run_benchmark(verbose=False):
client = AsyncOpenAI(base_url=API_URL, api_key="default")
semaphore = asyncio.Semaphore(CONCURRENCY)
queue = asyncio.Queue() queue = asyncio.Queue()
results = [] results = []
for i in range(NUM_REQUESTS):
for i in range(num_requests):
await queue.put(i) await queue.put(i)
for _ in range(CONCURRENCY):
for _ in range(concurrency):
await queue.put(None) await queue.put(None)
users = [
users = [asyncio.create_task(benchmark_user(client, semaphore, queue, results, model)) for _ in range(concurrency)] asyncio.create_task(benchmark_user(client, semaphore, queue, results, user_id, verbose))
for user_id in range(CONCURRENCY)
]
start_time = time.time() start_time = time.time()
await queue.join() await queue.join()
await asyncio.gather(*users) await asyncio.gather(*users)
end_time = time.time() end_time = time.time()
# Calculate metrics
total_elapsed_time = end_time - start_time total_elapsed_time = end_time - start_time
tokens_list = [r[0] for r in results if r and r[0] is not None] tokens_list = [r[0] for r in results if r and r[0] is not None]
latencies = [r[1] for r in results if r and r[1] is not None] latencies = [r[1] for r in results if r and r[1] is not None]
...@@ -102,13 +127,13 @@ async def run_benchmark(num_requests, concurrency, llm_url, model): ...@@ -102,13 +127,13 @@ async def run_benchmark(num_requests, concurrency, llm_url, model):
avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0 avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
avg_ms_per_token = sum(ms_per_token_list) / len(ms_per_token_list) if ms_per_token_list else None avg_ms_per_token = sum(ms_per_token_list) / len(ms_per_token_list) if ms_per_token_list else None
width_label = 18 width_label = 24
sep = "-" * 50 sep = "-" * 60
print(f"\n=== 📊 性能指标汇总 ({model}) ===") print(f"\n=== 📊 性能指标汇总 ({MODEL}) ===")
print(sep) print(sep)
print(f"{'并发数':<{width_label}}: {concurrency}") print(f"{'并发数':<{width_label}}: {CONCURRENCY}")
print(f"{'请求总数':<{width_label}}: {num_requests}") print(f"{'请求总数':<{width_label}}: {NUM_REQUESTS}")
print(f"{'成功请求数':<{width_label}}: {successful_requests}") print(f"{'成功请求数':<{width_label}}: {successful_requests}")
print(f"{'总耗时':<{width_label}}: {total_elapsed_time:.2f} s") print(f"{'总耗时':<{width_label}}: {total_elapsed_time:.2f} s")
print(f"{'总输出token数':<{width_label}}: {sum(tokens_list)}") print(f"{'总输出token数':<{width_label}}: {sum(tokens_list)}")
...@@ -118,21 +143,13 @@ async def run_benchmark(num_requests, concurrency, llm_url, model): ...@@ -118,21 +143,13 @@ async def run_benchmark(num_requests, concurrency, llm_url, model):
print(f"{'Average TTFT':<{width_label}}: {avg_ttft:.2f} s") print(f"{'Average TTFT':<{width_label}}: {avg_ttft:.2f} s")
print(f"{'Avg time per token':<{width_label}}: {avg_ms_per_token:.2f} ms/token") print(f"{'Avg time per token':<{width_label}}: {avg_ms_per_token:.2f} ms/token")
print(f"{'Avg Token generation speed':<{width_label}}: {avg_tokens_per_second:.2f} tokens/s") print(f"{'Avg Token generation speed':<{width_label}}: {avg_tokens_per_second:.2f} tokens/s")
print(sep)
if __name__ == "__main__":
    # CLI entry point: the only flag is --verbose, which turns on
    # per-request logging inside the benchmark workers.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--verbose", action="store_true")
    cli_args = arg_parser.parse_args()
    asyncio.run(run_benchmark(cli_args.verbose))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment