"examples/vscode:/vscode.git/clone" did not exist on "111228cb396f0ed33cdeb7dc718e20d7d629d2f1"
Unverified commit afd411d0, authored by min-xu-et, committed by GitHub

enhance latency test - part 2 (#915)

parent e1eae1fd
@@ -220,47 +220,32 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]))
 
 
-def latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
+@torch.inference_mode()
+def latency_test_run_once(
+    model_runner, rank_print, reqs, batch_size, input_len, output_len
 ):
-    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-
-    # Load the model
-    model_runner, tokenizer = load_model(server_args, tp_rank)
-    rank_print(
-        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
-    )
-
-    # To make this PR easier to review, for now, only do the first element in batch_size tuple.
-    bench_args.batch_size = bench_args.batch_size[0]
-
-    # Prepare inputs
-    reqs = prepare_synthetic_inputs_for_latency_test(
-        bench_args.batch_size, bench_args.input_len
-    )
-
-    def clear():
+    # Clear the pools.
     model_runner.req_to_token_pool.clear()
     model_runner.token_to_kv_pool.clear()
 
-    @torch.inference_mode()
-    def run_once(output_len):
     measurement_results = {
-        "batch_size": bench_args.batch_size,
+        "run_name": "before",
+        "batch_size": batch_size,
+        "input_len": input_len,
         "output_len": output_len,
     }
 
-    tot_latency = 0
-
     # Prefill
     torch.cuda.synchronize()
+    tot_latency = 0
     tic = time.time()
     next_token_ids, _, batch = extend(reqs, model_runner)
     torch.cuda.synchronize()
     prefill_latency = time.time() - tic
     tot_latency += prefill_latency
-    throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
+    throughput = input_len * batch_size / prefill_latency
     rank_print(
         f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
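Note on the formula above: prefill throughput is simply prompt tokens processed divided by prefill wall-clock time. A quick sanity check with made-up numbers (batch size, input length, and latency below are hypothetical, not taken from this benchmark):

# Hypothetical values to illustrate the prefill throughput formula.
batch_size, input_len = 16, 512      # assumed example values
prefill_latency = 0.25               # seconds, assumed
throughput = input_len * batch_size / prefill_latency
print(f"{throughput:9.2f} token/s")  # prints " 32768.00 token/s"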
@@ -275,24 +260,20 @@ def latency_test(
         torch.cuda.synchronize()
         latency = time.time() - tic
         tot_latency += latency
-        throughput = bench_args.batch_size / latency
+        throughput = batch_size / latency
         if i < 5:
             rank_print(
                 f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
     avg_decode_latency = (tot_latency - prefill_latency) / output_len
-    avg_decode_throughput = bench_args.batch_size / avg_decode_latency
+    avg_decode_throughput = batch_size / avg_decode_latency
     rank_print(
         f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
     )
     measurement_results["avg_decode_latency"] = avg_decode_latency
     measurement_results["avg_decode_throughput"] = avg_decode_throughput
 
-    throughput = (
-        (bench_args.input_len + bench_args.output_len)
-        * bench_args.batch_size
-        / tot_latency
-    )
+    throughput = (input_len + output_len) * batch_size / tot_latency
     rank_print(
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
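For reference, the decode metrics above treat each decode step as producing batch_size tokens, while the total throughput counts both input and output tokens over the whole run. A small worked example with hypothetical numbers (none of these values come from the commit):

# Hypothetical values to illustrate the decode/total throughput math.
batch_size, input_len, output_len = 16, 512, 128    # assumed example values
prefill_latency, avg_decode_latency = 0.25, 0.02    # seconds, assumed
tot_latency = prefill_latency + output_len * avg_decode_latency         # 2.81 s
avg_decode_throughput = batch_size / avg_decode_latency                 # 800.0 token/s
total_throughput = (input_len + output_len) * batch_size / tot_latency  # ~3644 token/s
print(avg_decode_throughput, total_throughput)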
@@ -300,13 +281,45 @@ def latency_test(
     measurement_results["total_throughput"] = throughput
     return measurement_results
 
 
+def latency_test(
+    server_args,
+    bench_args,
+    tp_rank,
+):
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+
+    # Load the model
+    model_runner, tokenizer = load_model(server_args, tp_rank)
+    rank_print(
+        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
+    )
+
+    # To make this PR easier to review, for now, only do the first element in batch_size tuple.
+    bench_args.batch_size = bench_args.batch_size[0]
+
+    # Prepare inputs
+    reqs = prepare_synthetic_inputs_for_latency_test(
+        bench_args.batch_size, bench_args.input_len
+    )
+
     # Warm up
-    run_once(4)
-    clear()
+    latency_test_run_once(
+        model_runner, rank_print, reqs, bench_args.batch_size, bench_args.input_len, 4
+    )
 
     # Run again
     result_list = []
-    result_list.append(run_once(bench_args.output_len))
+    result_list.append(
+        latency_test_run_once(
+            model_runner,
+            rank_print,
+            reqs,
+            bench_args.batch_size,
+            bench_args.input_len,
+            bench_args.output_len,
+        )
+    )
 
     # Write results in jsonlines format.
     if bench_args.result_filename:
...
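The diff is truncated before the result-writing code. For context only, "jsonlines" means one JSON object per line, appended per run; a minimal, hypothetical sketch of that pattern follows (not the code from this commit, and the helper name is made up):

import json

def write_results_jsonl(result_list, filename):
    # Append each measurement dict as one JSON object per line (jsonlines).
    with open(filename, "a") as fout:
        for result in result_list:
            fout.write(json.dumps(result) + "\n")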