feat: Use --request-rate and --request-rate-mode for aiperf client (#6585)

Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>

feat: Use --request-rate and --request-rate-mode for aiperf client (#6585)
Signed-off-by: Tzu-Ling <tzulingk@nvidia.com>
c18b4758 · Tzu-Ling Kan · GitHub · 967e2961 · c18b4758 · c18b4758
Unverified Commit c18b4758 authored Feb 25, 2026 by Tzu-Ling Kan Committed by GitHub Feb 25, 2026
3 changed files
--- a/tests/fault_tolerance/deploy/client.py
+++ b/tests/fault_tolerance/deploy/client.py
@@ -198,7 +198,7 @@ def run_aiperf(
    output_dir: Path,
    logger: logging.Logger,
    max_retries: int = 1,
-    retry_delay: float = 1,
+    max_request_rate: float = 1.0,
    continuous_load: bool = False,
 ) -> bool:
    """
@@ -216,7 +216,7 @@ def run_aiperf(
        output_dir: Directory for AI-Perf artifacts
        logger: Logger instance
        max_retries: Maximum number of retry attempts (default: 1)
-        retry_delay: Delay in seconds between retries (default: 1)
+        max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
        continuous_load: If True, use continuous load instead of fixed request count

    Returns:
@@ -248,6 +248,10 @@ def run_aiperf(
        # Request parameters
        "--concurrency",
        "1",  # Optional: we set to 1 for sequential
+        "--request-rate",
+        str(max_request_rate),  # Rate limiting (requests/sec)
+        "--request-rate-mode",
+        "constant",  # Use constant arrival pattern for predictable rate
        # Token configuration
        "--synthetic-input-tokens-mean",
        str(input_token_length),
@@ -279,11 +283,14 @@ def run_aiperf(
    logger.info(f"Starting AI-Perf for Pod {pod_name} Local Port {port}")
    logger.info(f"Using model name: {model}")

-    # Wait for model to be available
+    # Wait for model to be available initially
+    # Note: We only check once at start, then clients continue sending requests
+    # regardless of service health. This mimics real-world scenarios where clients
+    # don't know the server is down and continue retrying.
    model_ready = wait_for_model_availability(url, endpoint, model, logger)
    if not model_ready:
        logger.warning("Model not ready, but proceeding with AI-Perf test anyway")
-        # This might result in all requests failing, but the retry logic will handle it
+        # Clients will continue attempting - measuring failure/recovery is the point

    logger.info(f"Command: {' '.join(cmd)}")

@@ -360,6 +367,7 @@ def run_aiperf(

        # Sleep before next attempt (if not the last attempt and not continuous load)
        if not success and attempt < max_attempts - 1 and not continuous_load:
+            retry_delay = 5  # Hardcoded delay between retry attempts
            time.sleep(retry_delay)

    if success and not continuous_load:
@@ -510,7 +518,7 @@ def client(
    input_token_length: int,
    output_token_length: int,
    max_retries: int,
-    retry_delay: float = 1,
+    max_request_rate: float = 1.0,
    continuous_load: bool = False,
 ):
    """
@@ -530,7 +538,7 @@ def client(
        input_token_length: Number of input tokens per request
        output_token_length: Number of output tokens per request
        max_retries: Maximum retry attempts for AI-Perf execution
-        retry_delay: Delay in seconds between retry attempts
+        max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
        continuous_load: If True, use continuous load instead of fixed request count
    """
    logger = logging.getLogger(f"CLIENT: {index}")
@@ -577,7 +585,7 @@ def client(
            output_dir=client_output_dir,
            logger=logger,
            max_retries=max_retries,
-            retry_delay=retry_delay,
+            max_request_rate=max_request_rate,
            continuous_load=continuous_load,
        )


--- a/tests/fault_tolerance/deploy/client_factory.py
+++ b/tests/fault_tolerance/deploy/client_factory.py
@@ -41,7 +41,7 @@ def get_client_function(client_type: str) -> Callable:
            input_token_length,
            output_token_length,
            max_retries,
-            retry_delay_or_rate,  # Differs between implementations
+            max_request_rate,  # Used for request rate limiting in both implementations
            continuous_load,
        )

@@ -108,12 +108,12 @@ def get_client_description(client_type: str) -> str:
            "AI-Perf client: Uses the AI-Perf CLI tool for load generation. "
            "Provides comprehensive metrics including P50/P90/P99 latencies, "
            "TTFT (Time to First Token), ITL (Inter-Token Latency), and throughput. "
-            "Outputs results in JSON/CSV format with retry support at the test level."
+            "Outputs results in JSON/CSV format with request rate limiting and retry support."
        ),
        "legacy": (
            "Legacy custom client: Direct HTTP request loop with per-request retry logic. "
            "Logs results in JSONL format with basic latency and status tracking. "
-            "Includes rate limiting and round-robin pod selection."
+            "Includes request rate limiting and round-robin pod selection."
        ),
    }


--- a/tests/fault_tolerance/deploy/test_deployment.py
+++ b/tests/fault_tolerance/deploy/test_deployment.py
@@ -87,13 +87,8 @@ def _clients(
    procs: list[SpawnProcess] = []
    ctx = multiprocessing.get_context("spawn")

-    # Determine retry_delay_or_rate based on client type
-    if load_config.client_type == "legacy":
-        # Legacy client uses max_request_rate for rate limiting
-        retry_delay_or_rate = load_config.max_request_rate
-    else:
-        # AI-Perf client uses retry_delay between attempts (default 5s)
-        retry_delay_or_rate = 5
+    # Both client types use max_request_rate for rate limiting (requests/sec)
+    max_request_rate = load_config.max_request_rate

    # Check if this is a continuous load test (rolling upgrade scenarios)
    continuous_load = getattr(load_config, "continuous_load", False)
@@ -122,7 +117,7 @@ def _clients(
                    load_config.overflow_token_length,  # 2x max_seq_len tokens
                    load_config.output_token_length,
                    load_config.max_retries,
-                    retry_delay_or_rate,
+                    max_request_rate,
                    continuous_load,
                ),
            )
@@ -151,7 +146,7 @@ def _clients(
                    load_config.input_token_length,  # Normal token count
                    load_config.output_token_length,
                    load_config.max_retries,
-                    retry_delay_or_rate,
+                    max_request_rate,
                ),
            )
            proc_normal.start()
@@ -176,7 +171,7 @@ def _clients(
                        load_config.input_token_length,
                        load_config.output_token_length,
                        load_config.max_retries,
-                        retry_delay_or_rate,
+                        max_request_rate,
                        continuous_load,  # Pass continuous_load flag
                    ),
                )