"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "c3bfbd2040032d9b5f6e52fa8e1bf7e327533e98"
Unverified Commit 8bd5966b authored by Yongming Ding's avatar Yongming Ding Committed by GitHub
Browse files

fix: auto-scale request count in benchmarks (#6777)


Signed-off-by: Yongming Ding <yongmingd@nvidia.com>
parent 073fb437
...@@ -8,6 +8,12 @@ from typing import List ...@@ -8,6 +8,12 @@ from typing import List
# Default concurrency levels - can be overridden with CONCURRENCIES environment variable # Default concurrency levels - can be overridden with CONCURRENCIES environment variable
DEFAULT_CONCURRENCIES: List[int] = [1, 2, 5, 10, 50, 100, 250] DEFAULT_CONCURRENCIES: List[int] = [1, 2, 5, 10, 50, 100, 250]
# Default request count per concurrency level - can be overridden with the
# REQUEST_COUNT environment variable.
# A value of 0 (also the fallback when the variable is unset or invalid) selects
# "auto" mode: the count is computed per concurrency level as
# max(concurrency * REQUEST_COUNT_SCALE_FACTOR, 10), to ensure the concurrency
# level is fully utilized and each slot runs enough requests for stable
# measurements.
DEFAULT_REQUEST_COUNT: int = 0
# Multiplier applied to the concurrency level when the request count is auto-computed.
REQUEST_COUNT_SCALE_FACTOR: int = 3
def get_concurrency_levels() -> List[int]: def get_concurrency_levels() -> List[int]:
...@@ -30,6 +36,24 @@ def get_concurrency_levels() -> List[int]: ...@@ -30,6 +36,24 @@ def get_concurrency_levels() -> List[int]:
return DEFAULT_CONCURRENCIES return DEFAULT_CONCURRENCIES
def get_request_count() -> int:
    """Get the request count from the REQUEST_COUNT environment variable.

    Returns:
        The non-negative integer parsed from REQUEST_COUNT. If the variable is
        unset, empty, or whitespace-only, DEFAULT_REQUEST_COUNT is returned
        silently. If it is not an integer or is negative, a warning is printed
        and DEFAULT_REQUEST_COUNT is returned. A result of 0 indicates 'auto'
        mode (the count will be computed per concurrency level).
    """
    # Treat a blank value the same as an unset variable instead of reporting
    # it as an invalid integer.
    raw_value = os.getenv("REQUEST_COUNT", "").strip()
    if not raw_value:
        return DEFAULT_REQUEST_COUNT

    try:
        count = int(raw_value)
    except ValueError as e:
        # flush=True keeps the warning ordered with the other benchmark output,
        # which is also flushed on every print.
        print(
            f"WARNING: Invalid REQUEST_COUNT environment variable: {e}",
            flush=True,
        )
        return DEFAULT_REQUEST_COUNT

    if count < 0:
        print(
            "WARNING: Invalid REQUEST_COUNT environment variable: "
            f"Request count must be non-negative, got: {count}",
            flush=True,
        )
        return DEFAULT_REQUEST_COUNT

    return count
CONCURRENCIES: List[int] = get_concurrency_levels() CONCURRENCIES: List[int] = get_concurrency_levels()
...@@ -41,8 +65,21 @@ def run_aiperf( ...@@ -41,8 +65,21 @@ def run_aiperf(
stddev: int, stddev: int,
concurrency: int, concurrency: int,
output_dir: Path, output_dir: Path,
request_count: int = 0,
) -> None: ) -> None:
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# Auto-compute request count: need enough requests to fully utilize concurrency
# and run each slot at least REQUEST_COUNT_SCALE_FACTOR times for stable measurements
if request_count <= 0:
request_count = max(concurrency * REQUEST_COUNT_SCALE_FACTOR, 10)
elif request_count < concurrency:
print(
f"WARNING: request_count ({request_count}) < concurrency ({concurrency}). "
f"Actual in-flight concurrency will be capped at {request_count}.",
flush=True,
)
cmd = [ cmd = [
"aiperf", "aiperf",
"profile", "profile",
...@@ -59,6 +96,8 @@ def run_aiperf( ...@@ -59,6 +96,8 @@ def run_aiperf(
str(stddev), str(stddev),
"--concurrency", "--concurrency",
str(concurrency), str(concurrency),
"--request-count",
str(request_count),
"--output-tokens-mean", "--output-tokens-mean",
str(osl), str(osl),
"--extra-inputs", "--extra-inputs",
...@@ -73,7 +112,7 @@ def run_aiperf( ...@@ -73,7 +112,7 @@ def run_aiperf(
str(output_dir), str(output_dir),
] ]
print( print(
f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}", f"Running aiperf with isl {isl}, osl {osl}, concurrency {concurrency}, request_count {request_count}",
flush=True, flush=True,
) )
...@@ -102,12 +141,26 @@ def run_concurrency_sweep( ...@@ -102,12 +141,26 @@ def run_concurrency_sweep(
service_url: str, model_name: str, isl: int, osl: int, stddev: int, output_dir: Path service_url: str, model_name: str, isl: int, osl: int, stddev: int, output_dir: Path
) -> None: ) -> None:
concurrency_levels = get_concurrency_levels() concurrency_levels = get_concurrency_levels()
request_count = get_request_count()
print( print(
f"Running concurrency sweep for {model_name} with ISL {isl} and OSL {osl} and standard deviation {stddev}", f"Running concurrency sweep for {model_name} with ISL {isl} and OSL {osl} and standard deviation {stddev}",
flush=True, flush=True,
) )
print(f"Concurrency levels: {concurrency_levels}", flush=True) print(f"Concurrency levels: {concurrency_levels}", flush=True)
print(
f"Request count: {request_count if request_count > 0 else f'auto (max(concurrency*{REQUEST_COUNT_SCALE_FACTOR}, 10))'}",
flush=True,
)
for c in concurrency_levels: for c in concurrency_levels:
print(f"Starting concurrency level {c}", flush=True) print(f"Starting concurrency level {c}", flush=True)
run_aiperf(service_url, model_name, isl, osl, stddev, c, output_dir / f"c{c}") run_aiperf(
service_url,
model_name,
isl,
osl,
stddev,
c,
output_dir / f"c{c}",
request_count=request_count,
)
...@@ -233,6 +233,24 @@ python3 -m benchmarks.utils.benchmark \ ...@@ -233,6 +233,24 @@ python3 -m benchmarks.utils.benchmark \
--endpoint-url http://localhost:8000 --endpoint-url http://localhost:8000
``` ```
### Request Count Configuration
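The number of requests sent per concurrency level is auto-computed as `max(concurrency * 3, 10)` by default. This ensures each concurrency slot runs enough requests for stable measurements. You can override this with the `REQUEST_COUNT` environment variable; leaving it unset or setting it to `0` keeps the auto-computed value, and an invalid or negative value prints a warning and falls back to it: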
```bash
# Fixed request count for all concurrency levels
REQUEST_COUNT=500 python3 -m benchmarks.utils.benchmark \
--benchmark-name my-test \
--endpoint-url http://localhost:8000
# Combined with custom concurrency levels
CONCURRENCIES="1,10,50,200" REQUEST_COUNT=1000 python3 -m benchmarks.utils.benchmark \
--benchmark-name high-load-test \
--endpoint-url http://localhost:8000
```
**Important**: The request count should be greater than or equal to the concurrency level. If the request count is lower, the benchmark prints a warning and still runs, but the actual in-flight concurrency will be capped at the request count, leading to inaccurate results at higher concurrency levels.
## Understanding Your Results ## Understanding Your Results
After benchmarking completes, check `./benchmarks/results/` (or your custom output directory): After benchmarking completes, check `./benchmarks/results/` (or your custom output directory):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment