"examples/vscode:/vscode.git/clone" did not exist on "111228cb396f0ed33cdeb7dc718e20d7d629d2f1"
Unverified commit afd411d0, authored by min-xu-et, committed by GitHub

enhance latency test - part 2 (#915)

parent e1eae1fd
@@ -220,47 +220,32 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]))
 
 
-def latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
+@torch.inference_mode()
+def latency_test_run_once(
+    model_runner, rank_print, reqs, batch_size, input_len, output_len
 ):
-    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
-
-    # Load the model
-    model_runner, tokenizer = load_model(server_args, tp_rank)
-    rank_print(
-        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
-    )
-
-    # To make this PR easier to review, for now, only do the first element in batch_size tuple.
-    bench_args.batch_size = bench_args.batch_size[0]
-
-    # Prepare inputs
-    reqs = prepare_synthetic_inputs_for_latency_test(
-        bench_args.batch_size, bench_args.input_len
-    )
-
-    def clear():
+    # Clear the pools.
     model_runner.req_to_token_pool.clear()
     model_runner.token_to_kv_pool.clear()
 
-    @torch.inference_mode()
-    def run_once(output_len):
     measurement_results = {
-        "batch_size": bench_args.batch_size,
+        "run_name": "before",
+        "batch_size": batch_size,
+        "input_len": input_len,
         "output_len": output_len,
     }
 
-    tot_latency = 0
-
     # Prefill
     torch.cuda.synchronize()
+    tot_latency = 0
     tic = time.time()
     next_token_ids, _, batch = extend(reqs, model_runner)
     torch.cuda.synchronize()
     prefill_latency = time.time() - tic
     tot_latency += prefill_latency
-    throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
+    throughput = input_len * batch_size / prefill_latency
     rank_print(
         f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
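Note on the formula above: prefill throughput is simply prompt tokens processed divided by prefill wall-clock time. A quick sanity check with made-up numbers (batch size, input length, and latency below are hypothetical, not taken from this benchmark):

# Hypothetical values to illustrate the prefill throughput formula.
batch_size, input_len = 16, 512      # assumed example values
prefill_latency = 0.25               # seconds, assumed
throughput = input_len * batch_size / prefill_latency
print(f"{throughput:9.2f} token/s")  # prints " 32768.00 token/s"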
@@ -275,24 +260,20 @@ def latency_test(
         torch.cuda.synchronize()
         latency = time.time() - tic
         tot_latency += latency
-        throughput = bench_args.batch_size / latency
+        throughput = batch_size / latency
         if i < 5:
             rank_print(
                 f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
     avg_decode_latency = (tot_latency - prefill_latency) / output_len
-    avg_decode_throughput = bench_args.batch_size / avg_decode_latency
+    avg_decode_throughput = batch_size / avg_decode_latency
     rank_print(
         f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
     )
     measurement_results["avg_decode_latency"] = avg_decode_latency
     measurement_results["avg_decode_throughput"] = avg_decode_throughput
 
-    throughput = (
-        (bench_args.input_len + bench_args.output_len)
-        * bench_args.batch_size
-        / tot_latency
-    )
+    throughput = (input_len + output_len) * batch_size / tot_latency
     rank_print(
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
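For reference, the decode metrics above treat each decode step as producing batch_size tokens, while the total throughput counts both input and output tokens over the whole run. A small worked example with hypothetical numbers (none of these values come from the commit):

# Hypothetical values to illustrate the decode/total throughput math.
batch_size, input_len, output_len = 16, 512, 128    # assumed example values
prefill_latency, avg_decode_latency = 0.25, 0.02    # seconds, assumed
tot_latency = prefill_latency + output_len * avg_decode_latency         # 2.81 s
avg_decode_throughput = batch_size / avg_decode_latency                 # 800.0 token/s
total_throughput = (input_len + output_len) * batch_size / tot_latency  # ~3644 token/s
print(avg_decode_throughput, total_throughput)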
@@ -300,13 +281,45 @@ def latency_test(
     measurement_results["total_throughput"] = throughput
     return measurement_results
 
 
+def latency_test(
+    server_args,
+    bench_args,
+    tp_rank,
+):
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
+
+    # Load the model
+    model_runner, tokenizer = load_model(server_args, tp_rank)
+    rank_print(
+        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
+    )
+
+    # To make this PR easier to review, for now, only do the first element in batch_size tuple.
+    bench_args.batch_size = bench_args.batch_size[0]
+
+    # Prepare inputs
+    reqs = prepare_synthetic_inputs_for_latency_test(
+        bench_args.batch_size, bench_args.input_len
+    )
+
     # Warm up
-    run_once(4)
-    clear()
+    latency_test_run_once(
+        model_runner, rank_print, reqs, bench_args.batch_size, bench_args.input_len, 4
+    )
 
     # Run again
     result_list = []
-    result_list.append(run_once(bench_args.output_len))
+    result_list.append(
+        latency_test_run_once(
+            model_runner,
+            rank_print,
+            reqs,
+            bench_args.batch_size,
+            bench_args.input_len,
+            bench_args.output_len,
+        )
+    )
 
     # Write results in jsonlines format.
     if bench_args.result_filename:
...
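The diff is truncated before the result-writing code. For context only, "jsonlines" means one JSON object per line, appended per run; a minimal, hypothetical sketch of that pattern follows (not the code from this commit, and the helper name is made up):

import json

def write_results_jsonl(result_list, filename):
    # Append each measurement dict as one JSON object per line (jsonlines).
    with open(filename, "a") as fout:
        for result in result_list:
            fout.write(json.dumps(result) + "\n")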