[misc] benchmark_serving.py -- add ITL results and tweak TPOT results (#5263)

02cc3b51 · Tyler Michael Smith · GitHub · d5b1eb08 · 02cc3b51 · 02cc3b51
Unverified commit 02cc3b51, authored Jun 05, 2024 by Tyler Michael Smith; committed by GitHub Jun 05, 2024
Show whitespace changes
Inline Side-by-side

Showing with 23 additions and 2 deletions

.buildkite/run-benchmarks.sh .buildkite/run-benchmarks.sh +1 -1

benchmarks/benchmark_serving.py benchmarks/benchmark_serving.py +22 -1

No files found.
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -50,7 +50,7 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md
 # if the agent binary is not found, skip uploading the results, exit 0

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -56,6 +56,9 @@ class BenchmarkMetrics:
    mean_tpot_ms: float
    median_tpot_ms: float
    p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    p99_itl_ms: float
 def sample_sharegpt_requests(
@@ -200,16 +203,24 @@ def calculate_metrics(
    actual_output_lens = []
    total_input = 0
    completed = 0
+    itls = []
    tpots = []
    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
-            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl), since
+            # multiple output tokens may be bundled together.
+            # Note: this may inflate the output token count slightly.
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
            if output_len > 1:
                tpots.append(
                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            completed += 1
        else:
@@ -234,6 +245,9 @@ def calculate_metrics(
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
    )
    return metrics, actual_output_lens
@@ -333,6 +347,10 @@ async def benchmark(
    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                    metrics.median_tpot_ms))
    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
    print("=" * 50)
    result = {
@@ -349,6 +367,9 @@ async def benchmark(
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],