Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c18b4758
Unverified
Commit
c18b4758
authored
Feb 25, 2026
by
Tzu-Ling Kan
Committed by
GitHub
Feb 25, 2026
Browse files
feat: Use --request-rate and --request-rate-mode for aiperf client (#6585)
Signed-off-by:
Tzu-Ling
<
tzulingk@nvidia.com
>
parent
967e2961
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
23 additions
and
20 deletions
+23
-20
tests/fault_tolerance/deploy/client.py
tests/fault_tolerance/deploy/client.py
+15
-7
tests/fault_tolerance/deploy/client_factory.py
tests/fault_tolerance/deploy/client_factory.py
+3
-3
tests/fault_tolerance/deploy/test_deployment.py
tests/fault_tolerance/deploy/test_deployment.py
+5
-10
No files found.
tests/fault_tolerance/deploy/client.py
View file @
c18b4758
...
...
@@ -198,7 +198,7 @@ def run_aiperf(
output_dir
:
Path
,
logger
:
logging
.
Logger
,
max_retries
:
int
=
1
,
retry_delay
:
float
=
1
,
max_request_rate
:
float
=
1.0
,
continuous_load
:
bool
=
False
,
)
->
bool
:
"""
...
...
@@ -216,7 +216,7 @@ def run_aiperf(
output_dir: Directory for AI-Perf artifacts
logger: Logger instance
max_retries: Maximum number of retry attempts (default: 1)
retry_delay: Delay in seconds between retries (default: 1)
max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
continuous_load: If True, use continuous load instead of fixed request count
Returns:
...
...
@@ -248,6 +248,10 @@ def run_aiperf(
# Request parameters
"--concurrency"
,
"1"
,
# Optional: we set to 1 for sequential
"--request-rate"
,
str
(
max_request_rate
),
# Rate limiting (requests/sec)
"--request-rate-mode"
,
"constant"
,
# Use constant arrival pattern for predictable rate
# Token configuration
"--synthetic-input-tokens-mean"
,
str
(
input_token_length
),
...
...
@@ -279,11 +283,14 @@ def run_aiperf(
logger
.
info
(
f
"Starting AI-Perf for Pod
{
pod_name
}
Local Port
{
port
}
"
)
logger
.
info
(
f
"Using model name:
{
model
}
"
)
# Wait for model to be available
# Wait for model to be available initially
# Note: We only check once at start, then clients continue sending requests
# regardless of service health. This mimics real-world scenarios where clients
# don't know the server is down and continue retrying.
model_ready
=
wait_for_model_availability
(
url
,
endpoint
,
model
,
logger
)
if
not
model_ready
:
logger
.
warning
(
"Model not ready, but proceeding with AI-Perf test anyway"
)
# This might result in all requests failing, but the retry logic will handle it
# Clients will continue attempting - measuring failure/recovery is the point
logger
.
info
(
f
"Command:
{
' '
.
join
(
cmd
)
}
"
)
...
...
@@ -360,6 +367,7 @@ def run_aiperf(
# Sleep before next attempt (if not the last attempt and not continuous load)
if
not
success
and
attempt
<
max_attempts
-
1
and
not
continuous_load
:
retry_delay
=
5
# Hardcoded delay between retry attempts
time
.
sleep
(
retry_delay
)
if
success
and
not
continuous_load
:
...
...
@@ -510,7 +518,7 @@ def client(
input_token_length
:
int
,
output_token_length
:
int
,
max_retries
:
int
,
retry_delay
:
float
=
1
,
max_request_rate
:
float
=
1.0
,
continuous_load
:
bool
=
False
,
):
"""
...
...
@@ -530,7 +538,7 @@ def client(
input_token_length: Number of input tokens per request
output_token_length: Number of output tokens per request
max_retries: Maximum retry attempts for AI-Perf execution
retry_delay: Delay in seconds between retry attempts
max_request_rate: Maximum requests per second for rate limiting (default: 1.0)
continuous_load: If True, use continuous load instead of fixed request count
"""
logger
=
logging
.
getLogger
(
f
"CLIENT:
{
index
}
"
)
...
...
@@ -577,7 +585,7 @@ def client(
output_dir
=
client_output_dir
,
logger
=
logger
,
max_retries
=
max_retries
,
retry_delay
=
retry_delay
,
max_request_rate
=
max_request_rate
,
continuous_load
=
continuous_load
,
)
...
...
tests/fault_tolerance/deploy/client_factory.py
View file @
c18b4758
...
...
@@ -41,7 +41,7 @@ def get_client_function(client_type: str) -> Callable:
input_token_length,
output_token_length,
max_retries,
retry_delay_or_rate, # Differs between implementations
max_request_rate, # Used for request rate limiting in both implementations
continuous_load,
)
...
...
@@ -108,12 +108,12 @@ def get_client_description(client_type: str) -> str:
"AI-Perf client: Uses the AI-Perf CLI tool for load generation. "
"Provides comprehensive metrics including P50/P90/P99 latencies, "
"TTFT (Time to First Token), ITL (Inter-Token Latency), and throughput. "
"Outputs results in JSON/CSV format with retry support at the test level."
"Outputs results in JSON/CSV format with request rate limiting and retry support."
),
"legacy"
:
(
"Legacy custom client: Direct HTTP request loop with per-request retry logic. "
"Logs results in JSONL format with basic latency and status tracking. "
"Includes rate limiting and round-robin pod selection."
"Includes request rate limiting and round-robin pod selection."
),
}
...
...
tests/fault_tolerance/deploy/test_deployment.py
View file @
c18b4758
...
...
@@ -87,13 +87,8 @@ def _clients(
procs
:
list
[
SpawnProcess
]
=
[]
ctx
=
multiprocessing
.
get_context
(
"spawn"
)
# Determine retry_delay_or_rate based on client type
if
load_config
.
client_type
==
"legacy"
:
# Legacy client uses max_request_rate for rate limiting
retry_delay_or_rate
=
load_config
.
max_request_rate
else
:
# AI-Perf client uses retry_delay between attempts (default 5s)
retry_delay_or_rate
=
5
# Both client types use max_request_rate for rate limiting (requests/sec)
max_request_rate
=
load_config
.
max_request_rate
# Check if this is a continuous load test (rolling upgrade scenarios)
continuous_load
=
getattr
(
load_config
,
"continuous_load"
,
False
)
...
...
@@ -122,7 +117,7 @@ def _clients(
load_config
.
overflow_token_length
,
# 2x max_seq_len tokens
load_config
.
output_token_length
,
load_config
.
max_retries
,
retry_delay_or_rate
,
max_request_rate
,
continuous_load
,
),
)
...
...
@@ -151,7 +146,7 @@ def _clients(
load_config
.
input_token_length
,
# Normal token count
load_config
.
output_token_length
,
load_config
.
max_retries
,
retry_delay_or_rate
,
max_request_rate
,
),
)
proc_normal
.
start
()
...
...
@@ -176,7 +171,7 @@ def _clients(
load_config
.
input_token_length
,
load_config
.
output_token_length
,
load_config
.
max_retries
,
retry_delay_or_rate
,
max_request_rate
,
continuous_load
,
# Pass continuous_load flag
),
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment