fix: auto-scale request count in benchmarks (#6777)

Signed-off-by: Yongming Ding <yongmingd@nvidia.com>

fix: auto-scale request count in benchmarks (#6777)
Signed-off-by: Yongming Ding <yongmingd@nvidia.com>
8bd5966b · Yongming Ding · GitHub · 073fb437 · 8bd5966b · 8bd5966b
Unverified Commit 8bd5966b authored Mar 02, 2026 by Yongming Ding Committed by GitHub Mar 02, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 73 additions and 2 deletions

benchmarks/utils/aiperf.py benchmarks/utils/aiperf.py +55 -2

docs/benchmarks/benchmarking.md docs/benchmarks/benchmarking.md +18 -0

No files found.
--- a/benchmarks/utils/aiperf.py
+++ b/benchmarks/utils/aiperf.py
@@ -8,6 +8,12 @@ from typing import List

 # Default concurrency levels - can be overridden with CONCURRENCIES environment variable
 DEFAULT_CONCURRENCIES: List[int] = [1, 2, 5, 10, 50, 100, 250]
+# Default request count per concurrency level - can be overridden with the REQUEST_COUNT env var.
+# A value of 0 (the default, also used when REQUEST_COUNT is unset) means "auto": the count is
+# computed per concurrency level as max(concurrency * REQUEST_COUNT_SCALE_FACTOR, 10), so that
+# the concurrency level is fully utilized and each slot runs enough requests for stable measurements.
+DEFAULT_REQUEST_COUNT: int = 0
+REQUEST_COUNT_SCALE_FACTOR: int = 3


 def get_concurrency_levels() -> List[int]:
@@ -30,6 +36,24 @@ def get_concurrency_levels() -> List[int]:
    return DEFAULT_CONCURRENCIES


+def get_request_count() -> int:
+    """Read the request count from the REQUEST_COUNT env var, else the default.
+
+    A result of 0 means 'auto' mode (computed later per concurrency level).
+    """
+    raw_value = os.getenv("REQUEST_COUNT")
+    if not raw_value:
+        return DEFAULT_REQUEST_COUNT
+    try:
+        parsed = int(raw_value.strip())
+        if parsed < 0:
+            raise ValueError(f"Request count must be non-negative, got: {parsed}")
+    except ValueError as e:
+        print(f"WARNING: Invalid REQUEST_COUNT environment variable: {e}")
+        return DEFAULT_REQUEST_COUNT
+    return parsed
+
+
 CONCURRENCIES: List[int] = get_concurrency_levels()


@@ -41,8 +65,21 @@ def run_aiperf(
    stddev: int,
    concurrency: int,
    output_dir: Path,
+    request_count: int = 0,
 ) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Auto-compute request count: need enough requests to fully utilize concurrency
+    # and run each slot at least REQUEST_COUNT_SCALE_FACTOR times for stable measurements
+    if request_count <= 0:
+        request_count = max(concurrency * REQUEST_COUNT_SCALE_FACTOR, 10)
+    elif request_count < concurrency:
+        print(
+            f"WARNING: request_count ({request_count}) < concurrency ({concurrency}). "
+            f"Actual in-flight concurrency will be capped at {request_count}.",
+            flush=True,
+        )
+
    cmd = [
        "aiperf",
        "profile",
@@ -59,6 +96,8 @@ def run_aiperf(
        str(stddev),
        "--concurrency",
        str(concurrency),
+        "--request-count",
+        str(request_count),
        "--output-tokens-mean",
        str(osl),
        "--extra-inputs",
@@ -73,7 +112,7 @@ def run_aiperf(
        str(output_dir),
    ]
    print(
-        f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}",
+        f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}, request_count {request_count}",
        flush=True,
    )

@@ -102,12 +141,26 @@ def run_concurrency_sweep(
    service_url: str, model_name: str, isl: int, osl: int, stddev: int, output_dir: Path
 ) -> None:
    concurrency_levels = get_concurrency_levels()
+    request_count = get_request_count()
    print(
        f"Running concurrency sweep for {model_name} with ISL {isl} and OSL {osl} and standard deviation {stddev}",
        flush=True,
    )
    print(f"Concurrency levels: {concurrency_levels}", flush=True)
+    print(
+        f"Request count: {request_count if request_count > 0 else f'auto (max(concurrency*{REQUEST_COUNT_SCALE_FACTOR}, 10))'}",
+        flush=True,
+    )

    for c in concurrency_levels:
        print(f"Starting concurrency level {c}", flush=True)
-        run_aiperf(service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}")
+        run_aiperf(
+            service_url,
+            model_name,
+            isl,
+            osl,
+            stddev,
+            c,
+            output_dir / f"c{c}",
+            request_count=request_count,
+        )
--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
@@ -233,6 +233,24 @@ python3 -m benchmarks.utils.benchmark \
    --endpoint-url http://localhost:8000
 ```

+### Request Count Configuration
+
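+The number of requests sent per concurrency level is auto-computed as `max(concurrency * 3, 10)` by default, so that each concurrency slot runs enough requests for stable measurements. Leaving `REQUEST_COUNT` unset, or setting it to `0`, keeps this auto mode; an invalid or negative value prints a warning and falls back to it as well. To use a fixed count instead, set the `REQUEST_COUNT` environment variable to a positive integer: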
+
+```bash
+# Fixed request count for all concurrency levels
+REQUEST_COUNT=500 python3 -m benchmarks.utils.benchmark \
+    --benchmark-name my-test \
+    --endpoint-url http://localhost:8000
+
+# Combined with custom concurrency levels
+CONCURRENCIES="1,10,50,200" REQUEST_COUNT=1000 python3 -m benchmarks.utils.benchmark \
+    --benchmark-name high-load-test \
+    --endpoint-url http://localhost:8000
+```
+
+**Important**: The request count should be greater than or equal to the highest concurrency level in the sweep. If it is lower, the benchmark still runs but prints a warning, and the actual in-flight concurrency is capped at the request count, leading to inaccurate results at higher concurrency levels.
+
 ## Understanding Your Results

 After benchmarking completes, check `./benchmarks/results/` (or your custom output directory):