Unverified commit d83ab662, authored by Yan Ru Pei, committed by GitHub
Browse files

chore: make percentile plots in router benchmark (#5476)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 34c4882d
...@@ -84,6 +84,8 @@ def get_aiperf_cmd( ...@@ -84,6 +84,8 @@ def get_aiperf_cmd(
str(num_prefix_prompts), str(num_prefix_prompts),
"--artifact-dir", "--artifact-dir",
artifact_dir, artifact_dir,
"--dataset-sampling-strategy",
"shuffle",
"-H", "-H",
"Authorization: Bearer NOT USED", "Authorization: Bearer NOT USED",
"-H", "-H",
...@@ -157,20 +159,30 @@ def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]: ...@@ -157,20 +159,30 @@ def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]:
if not results: if not results:
return None return None
# For TTFT, we take the average across all URLs valid_results = [r for r in results if r is not None]
# For throughput, we sum across all URLs (total system throughput) if not valid_results:
ttft_values = [r["time_to_first_token"]["avg"] for r in results if r is not None]
throughput_values = [
r["output_token_throughput"]["avg"] for r in results if r is not None
]
if not ttft_values or not throughput_values:
return None return None
# For TTFT percentiles, average across URLs
ttft_p25_values = [r["time_to_first_token"]["p25"] for r in valid_results]
ttft_p50_values = [r["time_to_first_token"]["p50"] for r in valid_results]
ttft_p75_values = [r["time_to_first_token"]["p75"] for r in valid_results]
# For ITL percentiles, average across URLs
itl_p25_values = [r["inter_token_latency"]["p25"] for r in valid_results]
itl_p50_values = [r["inter_token_latency"]["p50"] for r in valid_results]
itl_p75_values = [r["inter_token_latency"]["p75"] for r in valid_results]
aggregated = { aggregated = {
"time_to_first_token": {"avg": sum(ttft_values) / len(ttft_values)}, "time_to_first_token": {
"output_token_throughput": { "p25": sum(ttft_p25_values) / len(ttft_p25_values),
"avg": sum(throughput_values) # Total throughput across all URLs "p50": sum(ttft_p50_values) / len(ttft_p50_values),
"p75": sum(ttft_p75_values) / len(ttft_p75_values),
},
"inter_token_latency": {
"p25": sum(itl_p25_values) / len(itl_p25_values),
"p50": sum(itl_p50_values) / len(itl_p50_values),
"p75": sum(itl_p75_values) / len(itl_p75_values),
}, },
} }
...@@ -328,8 +340,12 @@ def main(): ...@@ -328,8 +340,12 @@ def main():
# Store results # Store results
prefix_ratios = [] prefix_ratios = []
ttft_values = [] ttft_p25_values = []
throughput_values = [] ttft_p50_values = []
ttft_p75_values = []
itl_p25_values = []
itl_p50_values = []
itl_p75_values = []
current_seed = args.seed current_seed = args.seed
...@@ -350,50 +366,82 @@ def main(): ...@@ -350,50 +366,82 @@ def main():
) )
if result is not None: if result is not None:
ttft = result["time_to_first_token"]["avg"] ttft = result["time_to_first_token"]
throughput = result["output_token_throughput"]["avg"] itl = result["inter_token_latency"]
prefix_ratios.append(prefix_ratio) prefix_ratios.append(prefix_ratio)
ttft_values.append(ttft) ttft_p25_values.append(ttft["p25"])
throughput_values.append(throughput) ttft_p50_values.append(ttft["p50"])
ttft_p75_values.append(ttft["p75"])
itl_p25_values.append(itl["p25"])
itl_p50_values.append(itl["p50"])
itl_p75_values.append(itl["p75"])
logger.info( logger.info(
f"Prefix ratio {prefix_ratio}: TTFT={ttft:.2f}ms, Throughput={throughput:.2f} tokens/s" f"Prefix ratio {prefix_ratio}: TTFT p50={ttft['p50']:.2f}ms (p25={ttft['p25']:.2f}, p75={ttft['p75']:.2f}), "
f"ITL p50={itl['p50']:.2f}ms (p25={itl['p25']:.2f}, p75={itl['p75']:.2f})"
) )
current_seed += 1 current_seed += 1
# Create plots # Create plots
if prefix_ratios and ttft_values and throughput_values: if prefix_ratios and ttft_p50_values and itl_p50_values:
# Plot TTFT vs Prefix Ratio
plt.figure(figsize=(12, 5)) plt.figure(figsize=(12, 5))
# Plot TTFT vs Prefix Ratio with shaded p25-p75 region
plt.subplot(1, 2, 1) plt.subplot(1, 2, 1)
plt.plot(prefix_ratios, ttft_values, "bo-", linewidth=2, markersize=8) plt.fill_between(
prefix_ratios,
ttft_p25_values,
ttft_p75_values,
alpha=0.3,
color="blue",
label="p25-p75",
)
plt.plot(
prefix_ratios,
ttft_p50_values,
"bo-",
linewidth=2,
markersize=8,
label="p50",
)
plt.xlabel("Prefix Ratio") plt.xlabel("Prefix Ratio")
plt.ylabel("Time to First Token (ms)") plt.ylabel("Time to First Token (ms)")
plt.title("TTFT vs Prefix Ratio") plt.title("TTFT vs Prefix Ratio")
plt.grid(True, alpha=0.3) plt.grid(True, alpha=0.3)
for i, (pr, ttft) in enumerate(zip(prefix_ratios, ttft_values)): plt.legend()
for i, (pr, p50) in enumerate(zip(prefix_ratios, ttft_p50_values)):
plt.annotate( plt.annotate(
f"{ttft:.1f}ms", f"{p50:.1f}ms",
(pr, ttft), (pr, p50),
textcoords="offset points", textcoords="offset points",
xytext=(0, 10), xytext=(0, 10),
ha="center", ha="center",
) )
# Plot Throughput vs Prefix Ratio # Plot ITL vs Prefix Ratio with shaded p25-p75 region
plt.subplot(1, 2, 2) plt.subplot(1, 2, 2)
plt.plot(prefix_ratios, throughput_values, "ro-", linewidth=2, markersize=8) plt.fill_between(
prefix_ratios,
itl_p25_values,
itl_p75_values,
alpha=0.3,
color="red",
label="p25-p75",
)
plt.plot(
prefix_ratios, itl_p50_values, "ro-", linewidth=2, markersize=8, label="p50"
)
plt.xlabel("Prefix Ratio") plt.xlabel("Prefix Ratio")
plt.ylabel("Output Token Throughput (tokens/s)") plt.ylabel("Inter-Token Latency (ms)")
plt.title("Throughput vs Prefix Ratio") plt.title("ITL vs Prefix Ratio")
plt.grid(True, alpha=0.3) plt.grid(True, alpha=0.3)
for i, (pr, thpt) in enumerate(zip(prefix_ratios, throughput_values)): plt.legend()
for i, (pr, p50) in enumerate(zip(prefix_ratios, itl_p50_values)):
plt.annotate( plt.annotate(
f"{thpt:.1f}", f"{p50:.1f}ms",
(pr, thpt), (pr, p50),
textcoords="offset points", textcoords="offset points",
xytext=(0, 10), xytext=(0, 10),
ha="center", ha="center",
...@@ -409,8 +457,12 @@ def main(): ...@@ -409,8 +457,12 @@ def main():
# Save results to JSON # Save results to JSON
results_data = { results_data = {
"prefix_ratios": prefix_ratios, "prefix_ratios": prefix_ratios,
"ttft_values": ttft_values, "ttft_p25_values": ttft_p25_values,
"throughput_values": throughput_values, "ttft_p50_values": ttft_p50_values,
"ttft_p75_values": ttft_p75_values,
"itl_p25_values": itl_p25_values,
"itl_p50_values": itl_p50_values,
"itl_p75_values": itl_p75_values,
"config": { "config": {
"model": args.model, "model": args.model,
"tokenizer": args.tokenizer, "tokenizer": args.tokenizer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment