Unverified commit c18b4758, authored by Tzu-Ling Kan, committed by GitHub
Browse files

feat: Use --request-rate and --request-rate-mode for aiperf client (#6585)


Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>
parent 967e2961
...@@ -198,7 +198,7 @@ def run_aiperf( ...@@ -198,7 +198,7 @@ def run_aiperf(
output_dir: Path, output_dir: Path,
logger: logging.Logger, logger: logging.Logger,
max_retries: int = 1, max_retries: int = 1,
retry_delay: float = 1, max_request_rate: float = 1.0,
continuous_load: bool = False, continuous_load: bool = False,
) -> bool: ) -> bool:
""" """
...@@ -216,7 +216,7 @@ def run_aiperf( ...@@ -216,7 +216,7 @@ def run_aiperf(
output_dir: Directory for AI-Perf artifacts output_dir: Directory for AI-Perf artifacts
logger: Logger instance logger: Logger instance
max_retries: Maximum number of retry attempts (default: 1) max_retries: Maximum number of retry attempts (default: 1)
retry_delay: Delay in seconds between retries (default: 1) max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
continuous_load: If True, use continuous load instead of fixed request count continuous_load: If True, use continuous load instead of fixed request count
Returns: Returns:
...@@ -248,6 +248,10 @@ def run_aiperf( ...@@ -248,6 +248,10 @@ def run_aiperf(
# Request parameters # Request parameters
"--concurrency", "--concurrency",
"1", # Optional: we set to 1 for sequential "1", # Optional: we set to 1 for sequential
"--request-rate",
str(max_request_rate), # Rate limiting (requests/sec)
"--request-rate-mode",
"constant", # Use constant arrival pattern for predictable rate
# Token configuration # Token configuration
"--synthetic-input-tokens-mean", "--synthetic-input-tokens-mean",
str(input_token_length), str(input_token_length),
...@@ -279,11 +283,14 @@ def run_aiperf( ...@@ -279,11 +283,14 @@ def run_aiperf(
logger.info(f"Starting AI-Perf for Pod {pod_name} Local Port {port}") logger.info(f"Starting AI-Perf for Pod {pod_name} Local Port {port}")
logger.info(f"Using model name: {model}") logger.info(f"Using model name: {model}")
# Wait for model to be available # Wait for model to be available initially
# Note: We only check once at start, then clients continue sending requests
# regardless of service health. This mimics real-world scenarios where clients
# don't know the server is down and continue retrying.
model_ready = wait_for_model_availability(url, endpoint, model, logger) model_ready = wait_for_model_availability(url, endpoint, model, logger)
if not model_ready: if not model_ready:
logger.warning("Model not ready, but proceeding with AI-Perf test anyway") logger.warning("Model not ready, but proceeding with AI-Perf test anyway")
# This might result in all requests failing, but the retry logic will handle it # Clients will continue attempting - measuring failure/recovery is the point
logger.info(f"Command: {' '.join(cmd)}") logger.info(f"Command: {' '.join(cmd)}")
...@@ -360,6 +367,7 @@ def run_aiperf( ...@@ -360,6 +367,7 @@ def run_aiperf(
# Sleep before next attempt (if not the last attempt and not continuous load) # Sleep before next attempt (if not the last attempt and not continuous load)
if not success and attempt < max_attempts - 1 and not continuous_load: if not success and attempt < max_attempts - 1 and not continuous_load:
retry_delay = 5 # Hardcoded delay between retry attempts
time.sleep(retry_delay) time.sleep(retry_delay)
if success and not continuous_load: if success and not continuous_load:
...@@ -510,7 +518,7 @@ def client( ...@@ -510,7 +518,7 @@ def client(
input_token_length: int, input_token_length: int,
output_token_length: int, output_token_length: int,
max_retries: int, max_retries: int,
retry_delay: float = 1, max_request_rate: float = 1.0,
continuous_load: bool = False, continuous_load: bool = False,
): ):
""" """
...@@ -530,7 +538,7 @@ def client( ...@@ -530,7 +538,7 @@ def client(
input_token_length: Number of input tokens per request input_token_length: Number of input tokens per request
output_token_length: Number of output tokens per request output_token_length: Number of output tokens per request
max_retries: Maximum retry attempts for AI-Perf execution max_retries: Maximum retry attempts for AI-Perf execution
retry_delay: Delay in seconds between retry attempts max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
continuous_load: If True, use continuous load instead of fixed request count continuous_load: If True, use continuous load instead of fixed request count
""" """
logger = logging.getLogger(f"CLIENT: {index}") logger = logging.getLogger(f"CLIENT: {index}")
...@@ -577,7 +585,7 @@ def client( ...@@ -577,7 +585,7 @@ def client(
output_dir=client_output_dir, output_dir=client_output_dir,
logger=logger, logger=logger,
max_retries=max_retries, max_retries=max_retries,
retry_delay=retry_delay, max_request_rate=max_request_rate,
continuous_load=continuous_load, continuous_load=continuous_load,
) )
......
...@@ -41,7 +41,7 @@ def get_client_function(client_type: str) -> Callable: ...@@ -41,7 +41,7 @@ def get_client_function(client_type: str) -> Callable:
input_token_length, input_token_length,
output_token_length, output_token_length,
max_retries, max_retries,
retry_delay_or_rate, # Differs between implementations max_request_rate, # Used for request rate limiting in both implementations
continuous_load, continuous_load,
) )
...@@ -108,12 +108,12 @@ def get_client_description(client_type: str) -> str: ...@@ -108,12 +108,12 @@ def get_client_description(client_type: str) -> str:
"AI-Perf client: Uses the AI-Perf CLI tool for load generation. " "AI-Perf client: Uses the AI-Perf CLI tool for load generation. "
"Provides comprehensive metrics including P50/P90/P99 latencies, " "Provides comprehensive metrics including P50/P90/P99 latencies, "
"TTFT (Time to First Token), ITL (Inter-Token Latency), and throughput. " "TTFT (Time to First Token), ITL (Inter-Token Latency), and throughput. "
"Outputs results in JSON/CSV format with retry support at the test level." "Outputs results in JSON/CSV format with request rate limiting and retry support."
), ),
"legacy": ( "legacy": (
"Legacy custom client: Direct HTTP request loop with per-request retry logic. " "Legacy custom client: Direct HTTP request loop with per-request retry logic. "
"Logs results in JSONL format with basic latency and status tracking. " "Logs results in JSONL format with basic latency and status tracking. "
"Includes rate limiting and round-robin pod selection." "Includes request rate limiting and round-robin pod selection."
), ),
} }
......
...@@ -87,13 +87,8 @@ def _clients( ...@@ -87,13 +87,8 @@ def _clients(
procs: list[SpawnProcess] = [] procs: list[SpawnProcess] = []
ctx = multiprocessing.get_context("spawn") ctx = multiprocessing.get_context("spawn")
# Determine retry_delay_or_rate based on client type # Both client types use max_request_rate for rate limiting (requests/sec)
if load_config.client_type == "legacy": max_request_rate = load_config.max_request_rate
# Legacy client uses max_request_rate for rate limiting
retry_delay_or_rate = load_config.max_request_rate
else:
# AI-Perf client uses retry_delay between attempts (default 5s)
retry_delay_or_rate = 5
# Check if this is a continuous load test (rolling upgrade scenarios) # Check if this is a continuous load test (rolling upgrade scenarios)
continuous_load = getattr(load_config, "continuous_load", False) continuous_load = getattr(load_config, "continuous_load", False)
...@@ -122,7 +117,7 @@ def _clients( ...@@ -122,7 +117,7 @@ def _clients(
load_config.overflow_token_length, # 2x max_seq_len tokens load_config.overflow_token_length, # 2x max_seq_len tokens
load_config.output_token_length, load_config.output_token_length,
load_config.max_retries, load_config.max_retries,
retry_delay_or_rate, max_request_rate,
continuous_load, continuous_load,
), ),
) )
...@@ -151,7 +146,7 @@ def _clients( ...@@ -151,7 +146,7 @@ def _clients(
load_config.input_token_length, # Normal token count load_config.input_token_length, # Normal token count
load_config.output_token_length, load_config.output_token_length,
load_config.max_retries, load_config.max_retries,
retry_delay_or_rate, max_request_rate,
), ),
) )
proc_normal.start() proc_normal.start()
...@@ -176,7 +171,7 @@ def _clients( ...@@ -176,7 +171,7 @@ def _clients(
load_config.input_token_length, load_config.input_token_length,
load_config.output_token_length, load_config.output_token_length,
load_config.max_retries, load_config.max_retries,
retry_delay_or_rate, max_request_rate,
continuous_load, # Pass continuous_load flag continuous_load, # Pass continuous_load flag
), ),
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment