Unverified Commit 9ce89bc1 authored by Ying Sheng, committed by GitHub

Update benchmark script (#571)

parent badf3fa0
@@ -81,6 +81,7 @@ def load_model(server_args, tp_rank):
         nccl_port=28888,
         server_args=server_args,
     )
+    print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
     tokenizer = get_tokenizer(
         server_args.tokenizer_path,
         tokenizer_mode=server_args.tokenizer_mode,
@@ -209,6 +210,7 @@ def latency_test(
     # Load the model
     model_runner, tokenizer = load_model(server_args, tp_rank)
+    print(f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}")
 
     # Prepare inputs
     reqs = prepare_synthetic_inputs(bench_args, tokenizer)
@@ -221,22 +223,31 @@ def latency_test(
     def run_once(output_len):
         # Prefill
         torch.cuda.synchronize()
+        tot_latency = 0
         tic = time.time()
         next_token_ids, _, batch = extend(reqs, model_runner)
         torch.cuda.synchronize()
-        latency = time.time() - tic
-        throughput = bench_args.input_len * bench_args.batch_size / latency
-        rank_print(f"Prefill. latency: {latency:6.3f} ms, throughput: {throughput:9.2f} token/s")
+        prefill_latency = time.time() - tic
+        tot_latency += prefill_latency
+        throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
+        rank_print(f"Prefill. latency: {prefill_latency:6.5f} ms, throughput: {throughput:9.2f} token/s")
 
         # Decode
-        for _ in range(output_len):
+        for i in range(output_len):
             torch.cuda.synchronize()
             tic = time.time()
             next_token_ids, _ = decode(next_token_ids, batch, model_runner)
             torch.cuda.synchronize()
             latency = time.time() - tic
+            tot_latency += latency
             throughput = bench_args.batch_size / latency
-            rank_print(f"Decode. latency: {latency:6.3f} ms, throughput: {throughput:9.2f} token/s")
+            if i < 5: rank_print(f"Decode. latency: {latency:6.5f} ms, throughput: {throughput:9.2f} token/s")
+
+        avg_decode_latency = (tot_latency - prefill_latency) / output_len
+        avg_decode_throughput = bench_args.batch_size / avg_decode_latency
+        rank_print(f"Decode. avg latency: {avg_decode_latency:6.5f} ms, avg throughput: {avg_decode_throughput:9.2f} token/s")
+
+        throughput = (bench_args.input_len + bench_args.output_len) * bench_args.batch_size / tot_latency
+        rank_print(f"Total. latency: {tot_latency:6.3f} ms, throughput: {throughput:9.2f} token/s")
 
     # Warm up
     run_once(4)
...
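For reference, the two derived quantities this commit starts printing can be sanity-checked by hand. The sketch below is a standalone illustration, not output from the script: all numeric values are made-up assumptions, and only the formulas mirror the diff above.

# Hypothetical example values (assumptions, not from any real run), used to
# illustrate the formulas the updated benchmark script now prints.
max_total_num_tokens = 200_000   # assumed KV-cache capacity reported by ModelRunner
input_len = 1024                 # assumed benchmark input length
output_len = 256                 # assumed benchmark output length
batch_size = 16                  # assumed benchmark batch size

# Same formula as the new max_batch_size print in latency_test().
max_batch_size = max_total_num_tokens // (input_len + output_len)
print(f"max_batch_size={max_batch_size}")  # 200000 // 1280 = 156

# Same bookkeeping as run_once(): total latency minus prefill latency,
# averaged over the decode steps, then converted to tokens per second.
# time.time() differences are in seconds, so these assumed values are too.
prefill_latency = 0.5            # assumed prefill time
tot_latency = 13.3               # assumed prefill + all decode steps
avg_decode_latency = (tot_latency - prefill_latency) / output_len
avg_decode_throughput = batch_size / avg_decode_latency
print(f"avg decode latency: {avg_decode_latency:6.5f}, "
      f"avg decode throughput: {avg_decode_throughput:9.2f} token/s")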