Unverified commit c18b4758, authored by Tzu-Ling Kan, committed by GitHub
Browse files

feat: Use --request-rate and --request-rate-mode for aiperf client (#6585)


Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>
parent 967e2961
......@@ -198,7 +198,7 @@ def run_aiperf(
output_dir: Path,
logger: logging.Logger,
max_retries: int = 1,
retry_delay: float = 1,
max_request_rate: float = 1.0,
continuous_load: bool = False,
) -> bool:
"""
......@@ -216,7 +216,7 @@ def run_aiperf(
output_dir: Directory for AI-Perf artifacts
logger: Logger instance
max_retries: Maximum number of retry attempts (default: 1)
retry_delay: Delay in seconds between retries (default: 1)
max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
continuous_load: If True, use continuous load instead of fixed request count
Returns:
......@@ -248,6 +248,10 @@ def run_aiperf(
# Request parameters
"--concurrency",
"1", # Optional: we set to 1 for sequential
"--request-rate",
str(max_request_rate), # Rate limiting (requests/sec)
"--request-rate-mode",
"constant", # Use constant arrival pattern for predictable rate
# Token configuration
"--synthetic-input-tokens-mean",
str(input_token_length),
......@@ -279,11 +283,14 @@ def run_aiperf(
logger.info(f"Starting AI-Perf for Pod {pod_name} Local Port {port}")
logger.info(f"Using model name: {model}")
# Wait for model to be available
# Wait for model to be available initially
# Note: We only check once at start, then clients continue sending requests
# regardless of service health. This mimics real-world scenarios where clients
# don't know the server is down and continue retrying.
model_ready = wait_for_model_availability(url, endpoint, model, logger)
if not model_ready:
logger.warning("Model not ready, but proceeding with AI-Perf test anyway")
# This might result in all requests failing, but the retry logic will handle it
# Clients will continue attempting - measuring failure/recovery is the point
logger.info(f"Command: {' '.join(cmd)}")
......@@ -360,6 +367,7 @@ def run_aiperf(
# Sleep before next attempt (if not the last attempt and not continuous load)
if not success and attempt < max_attempts - 1 and not continuous_load:
retry_delay = 5 # Hardcoded delay between retry attempts
time.sleep(retry_delay)
if success and not continuous_load:
......@@ -510,7 +518,7 @@ def client(
input_token_length: int,
output_token_length: int,
max_retries: int,
retry_delay: float = 1,
max_request_rate: float = 1.0,
continuous_load: bool = False,
):
"""
......@@ -530,7 +538,7 @@ def client(
input_token_length: Number of input tokens per request
output_token_length: Number of output tokens per request
max_retries: Maximum retry attempts for AI-Perf execution
retry_delay: Delay in seconds between retry attempts
max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
continuous_load: If True, use continuous load instead of fixed request count
"""
logger = logging.getLogger(f"CLIENT: {index}")
......@@ -577,7 +585,7 @@ def client(
output_dir=client_output_dir,
logger=logger,
max_retries=max_retries,
retry_delay=retry_delay,
max_request_rate=max_request_rate,
continuous_load=continuous_load,
)
......
......@@ -41,7 +41,7 @@ def get_client_function(client_type: str) -> Callable:
input_token_length,
output_token_length,
max_retries,
retry_delay_or_rate, # Differs between implementations
max_request_rate, # Used for request rate limiting in both implementations
continuous_load,
)
......@@ -108,12 +108,12 @@ def get_client_description(client_type: str) -> str:
"AI-Perf client: Uses the AI-Perf CLI tool for load generation. "
"Provides comprehensive metrics including P50/P90/P99 latencies, "
"TTFT (Time to First Token), ITL (Inter-Token Latency), and throughput. "
"Outputs results in JSON/CSV format with retry support at the test level."
"Outputs results in JSON/CSV format with request rate limiting and retry support."
),
"legacy": (
"Legacy custom client: Direct HTTP request loop with per-request retry logic. "
"Logs results in JSONL format with basic latency and status tracking. "
"Includes rate limiting and round-robin pod selection."
"Includes request rate limiting and round-robin pod selection."
),
}
......
......@@ -87,13 +87,8 @@ def _clients(
procs: list[SpawnProcess] = []
ctx = multiprocessing.get_context("spawn")
# Determine retry_delay_or_rate based on client type
if load_config.client_type == "legacy":
# Legacy client uses max_request_rate for rate limiting
retry_delay_or_rate = load_config.max_request_rate
else:
# AI-Perf client uses retry_delay between attempts (default 5s)
retry_delay_or_rate = 5
# Both client types use max_request_rate for rate limiting (requests/sec)
max_request_rate = load_config.max_request_rate
# Check if this is a continuous load test (rolling upgrade scenarios)
continuous_load = getattr(load_config, "continuous_load", False)
......@@ -122,7 +117,7 @@ def _clients(
load_config.overflow_token_length, # 2x max_seq_len tokens
load_config.output_token_length,
load_config.max_retries,
retry_delay_or_rate,
max_request_rate,
continuous_load,
),
)
......@@ -151,7 +146,7 @@ def _clients(
load_config.input_token_length, # Normal token count
load_config.output_token_length,
load_config.max_retries,
retry_delay_or_rate,
max_request_rate,
),
)
proc_normal.start()
......@@ -176,7 +171,7 @@ def _clients(
load_config.input_token_length,
load_config.output_token_length,
load_config.max_retries,
retry_delay_or_rate,
max_request_rate,
continuous_load, # Pass continuous_load flag
),
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment