chore: make precentile plots in router benchmark (#5476)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

chore: make precentile plots in router benchmark (#5476)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
d83ab662 · Yan Ru Pei · GitHub · 34c4882d · d83ab662
Unverified Commit d83ab662 authored Jan 16, 2026 by Yan Ru Pei Committed by GitHub Jan 16, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 85 additions and 33 deletions

benchmarks/router/prefix_ratio_benchmark.py benchmarks/router/prefix_ratio_benchmark.py +85 -33

No files found.
--- a/benchmarks/router/prefix_ratio_benchmark.py
+++ b/benchmarks/router/prefix_ratio_benchmark.py
@@ -84,6 +84,8 @@ def get_aiperf_cmd(
        str(num_prefix_prompts),
        "--artifact-dir",
        artifact_dir,
+        "--dataset-sampling-strategy",
+        "shuffle",
        "-H",
        "Authorization: Bearer NOT USED",
        "-H",
@@ -157,20 +159,30 @@ def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]:
    if not results:
        return None

-    # For TTFT, we take the average across all URLs
-    # For throughput, we sum across all URLs (total system throughput)
-    ttft_values = [r["time_to_first_token"]["avg"] for r in results if r is not None]
-    throughput_values = [
-        r["output_token_throughput"]["avg"] for r in results if r is not None
-    ]
-
-    if not ttft_values or not throughput_values:
+    valid_results = [r for r in results if r is not None]
+    if not valid_results:
        return None

+    # For TTFT percentiles, average across URLs
+    ttft_p25_values = [r["time_to_first_token"]["p25"] for r in valid_results]
+    ttft_p50_values = [r["time_to_first_token"]["p50"] for r in valid_results]
+    ttft_p75_values = [r["time_to_first_token"]["p75"] for r in valid_results]
+
+    # For ITL percentiles, average across URLs
+    itl_p25_values = [r["inter_token_latency"]["p25"] for r in valid_results]
+    itl_p50_values = [r["inter_token_latency"]["p50"] for r in valid_results]
+    itl_p75_values = [r["inter_token_latency"]["p75"] for r in valid_results]
+
    aggregated = {
-        "time_to_first_token": {"avg": sum(ttft_values) / len(ttft_values)},
-        "output_token_throughput": {
-            "avg": sum(throughput_values)  # Total throughput across all URLs
+        "time_to_first_token": {
+            "p25": sum(ttft_p25_values) / len(ttft_p25_values),
+            "p50": sum(ttft_p50_values) / len(ttft_p50_values),
+            "p75": sum(ttft_p75_values) / len(ttft_p75_values),
+        },
+        "inter_token_latency": {
+            "p25": sum(itl_p25_values) / len(itl_p25_values),
+            "p50": sum(itl_p50_values) / len(itl_p50_values),
+            "p75": sum(itl_p75_values) / len(itl_p75_values),
        },
    }

@@ -328,8 +340,12 @@ def main():

    # Store results
    prefix_ratios = []
-    ttft_values = []
-    throughput_values = []
+    ttft_p25_values = []
+    ttft_p50_values = []
+    ttft_p75_values = []
+    itl_p25_values = []
+    itl_p50_values = []
+    itl_p75_values = []

    current_seed = args.seed

@@ -350,50 +366,82 @@ def main():
        )

        if result is not None:
-            ttft = result["time_to_first_token"]["avg"]
-            throughput = result["output_token_throughput"]["avg"]
+            ttft = result["time_to_first_token"]
+            itl = result["inter_token_latency"]

            prefix_ratios.append(prefix_ratio)
-            ttft_values.append(ttft)
-            throughput_values.append(throughput)
+            ttft_p25_values.append(ttft["p25"])
+            ttft_p50_values.append(ttft["p50"])
+            ttft_p75_values.append(ttft["p75"])
+            itl_p25_values.append(itl["p25"])
+            itl_p50_values.append(itl["p50"])
+            itl_p75_values.append(itl["p75"])

            logger.info(
-                f"Prefix ratio {prefix_ratio}: TTFT={ttft:.2f}ms, Throughput={throughput:.2f} tokens/s"
+                f"Prefix ratio {prefix_ratio}: TTFT p50={ttft['p50']:.2f}ms (p25={ttft['p25']:.2f}, p75={ttft['p75']:.2f}), "
+                f"ITL p50={itl['p50']:.2f}ms (p25={itl['p25']:.2f}, p75={itl['p75']:.2f})"
            )

        current_seed += 1

    # Create plots
-    if prefix_ratios and ttft_values and throughput_values:
-        # Plot TTFT vs Prefix Ratio
+    if prefix_ratios and ttft_p50_values and itl_p50_values:
        plt.figure(figsize=(12, 5))

+        # Plot TTFT vs Prefix Ratio with shaded p25-p75 region
        plt.subplot(1, 2, 1)
-        plt.plot(prefix_ratios, ttft_values, "bo-", linewidth=2, markersize=8)
+        plt.fill_between(
+            prefix_ratios,
+            ttft_p25_values,
+            ttft_p75_values,
+            alpha=0.3,
+            color="blue",
+            label="p25-p75",
+        )
+        plt.plot(
+            prefix_ratios,
+            ttft_p50_values,
+            "bo-",
+            linewidth=2,
+            markersize=8,
+            label="p50",
+        )
        plt.xlabel("Prefix Ratio")
        plt.ylabel("Time to First Token (ms)")
        plt.title("TTFT vs Prefix Ratio")
        plt.grid(True, alpha=0.3)
-        for i, (pr, ttft) in enumerate(zip(prefix_ratios, ttft_values)):
+        plt.legend()
+        for i, (pr, p50) in enumerate(zip(prefix_ratios, ttft_p50_values)):
            plt.annotate(
-                f"{ttft:.1f}ms",
-                (pr, ttft),
+                f"{p50:.1f}ms",
+                (pr, p50),
                textcoords="offset points",
                xytext=(0, 10),
                ha="center",
            )

-        # Plot Throughput vs Prefix Ratio
+        # Plot ITL vs Prefix Ratio with shaded p25-p75 region
        plt.subplot(1, 2, 2)
-        plt.plot(prefix_ratios, throughput_values, "ro-", linewidth=2, markersize=8)
+        plt.fill_between(
+            prefix_ratios,
+            itl_p25_values,
+            itl_p75_values,
+            alpha=0.3,
+            color="red",
+            label="p25-p75",
+        )
+        plt.plot(
+            prefix_ratios, itl_p50_values, "ro-", linewidth=2, markersize=8, label="p50"
+        )
        plt.xlabel("Prefix Ratio")
-        plt.ylabel("Output Token Throughput (tokens/s)")
-        plt.title("Throughput vs Prefix Ratio")
+        plt.ylabel("Inter-Token Latency (ms)")
+        plt.title("ITL vs Prefix Ratio")
        plt.grid(True, alpha=0.3)
-        for i, (pr, thpt) in enumerate(zip(prefix_ratios, throughput_values)):
+        plt.legend()
+        for i, (pr, p50) in enumerate(zip(prefix_ratios, itl_p50_values)):
            plt.annotate(
-                f"{thpt:.1f}",
-                (pr, thpt),
+                f"{p50:.1f}ms",
+                (pr, p50),
                textcoords="offset points",
                xytext=(0, 10),
                ha="center",
@@ -409,8 +457,12 @@ def main():
        # Save results to JSON
        results_data = {
            "prefix_ratios": prefix_ratios,
-            "ttft_values": ttft_values,
-            "throughput_values": throughput_values,
+            "ttft_p25_values": ttft_p25_values,
+            "ttft_p50_values": ttft_p50_values,
+            "ttft_p75_values": ttft_p75_values,
+            "itl_p25_values": itl_p25_values,
+            "itl_p50_values": itl_p50_values,
+            "itl_p75_values": itl_p75_values,
            "config": {
                "model": args.model,
                "tokenizer": args.tokenizer,