fix: profiling script skips decode sweep points when KV cache is tight (#2567)

b98188c8 · Hongkuan Zhou · GitHub · 19f8eb00 · b98188c8
Unverified Commit b98188c8 authored Aug 21, 2025 by Hongkuan Zhou Committed by GitHub Aug 21, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 7 deletions

benchmarks/profiler/utils/profile_decode.py +14 -7

No files found.
--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
@@ -42,18 +42,25 @@ def profile_decode(
        (max_context_length - osl) // interpolation_granularity,
    ):
        max_concurrency = max_kv_tokens // (isl + osl)
-        if max_concurrency // interpolation_granularity == 0:
+        if max_concurrency == 0:
+            logger.warning(
+                f"max_kv_tokens {max_kv_tokens} is too small for"
+                f" isl {isl} + osl {osl}, skipping."
+            )
+            break
+        elif max_concurrency < interpolation_granularity:
            logger.warning(
                f"max_concurrency {max_concurrency} is too small for"
                f" interpolation granularity {interpolation_granularity}."
                f" max_kv_tokens {max_kv_tokens}, isl {isl}, osl {osl}"
            )
-            break
-        sweep_num_request = range(
-            1,
-            max_concurrency,
-            max_concurrency // interpolation_granularity,
-        )
+            sweep_num_request = range(1, max_concurrency + 1)
+        else:
+            sweep_num_request = range(
+                1,
+                max_concurrency,
+                max_concurrency // interpolation_granularity,
+            )
        for num_request in sweep_num_request:
            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
            gap_result = benchmark_decode(