Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
359765d3
Unverified
Commit
359765d3
authored
Feb 14, 2026
by
Hongkuan Zhou
Committed by
GitHub
Feb 14, 2026
Browse files
feat: load-based scaling in SLA Planner (#6145)
Signed-off-by:
hongkuanz
<
hongkuanz@nvidia.com
>
parent
815b1291
Changes
27
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
879 additions
and
31 deletions
+879
-31
tests/planner/scaling/disagg_planner_throughput.yaml
tests/planner/scaling/disagg_planner_throughput.yaml
+71
-0
tests/planner/scaling/run_scaling_test.sh
tests/planner/scaling/run_scaling_test.sh
+28
-6
tests/planner/test_replica_calculation.py
tests/planner/test_replica_calculation.py
+6
-6
tests/planner/test_scaling_e2e.py
tests/planner/test_scaling_e2e.py
+23
-7
tests/planner/unit/test_load_based_scaling.py
tests/planner/unit/test_load_based_scaling.py
+736
-0
tests/planner/unit/test_sla_planner_scaling.py
tests/planner/unit/test_sla_planner_scaling.py
+6
-9
tests/planner/utils/load_generator.py
tests/planner/utils/load_generator.py
+9
-3
No files found.
tests/planner/scaling/disagg_planner_throughput.yaml
0 → 100644
View file @
359765d3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
vllm-disagg-planner
spec
:
services
:
Frontend
:
componentType
:
frontend
replicas
:
1
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
Planner
:
componentType
:
planner
replicas
:
1
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/src/dynamo/planner
command
:
-
python3
-
-m
-
planner_sla
args
:
-
--environment=kubernetes
-
--backend=vllm
-
--adjustment-interval=60
-
--profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
-
--no-correction
VllmDecodeWorker
:
envFromSecret
:
hf-token-secret
componentType
:
worker
subComponentType
:
decode
replicas
:
1
resources
:
limits
:
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
command
:
-
python3
args
:
-
-m
-
dynamo.vllm
-
--model
-
nvidia/Llama-3.1-8B-Instruct-FP8
VllmPrefillWorker
:
envFromSecret
:
hf-token-secret
componentType
:
worker
subComponentType
:
prefill
replicas
:
1
resources
:
limits
:
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
command
:
-
python3
args
:
-
-m
-
dynamo.vllm
-
--model
-
nvidia/Llama-3.1-8B-Instruct-FP8
-
--is-prefill-worker
tests/planner/scaling/run_scaling_test.sh
View file @
359765d3
...
...
@@ -7,20 +7,24 @@
# 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready
# 4. Runs the
hardcoded
scaling test (
12
req/s ->
24
req/s)
# 4. Runs the scaling test (
8
req/s ->
18
req/s)
# 5. Cleans up
#
# Supports two modes:
# --mode throughput (default) Uses throughput-based planner
# --mode load Uses load-based planner with regression scaling
set
-e
# Configuration
NAMESPACE
=
${
NAMESPACE
:-
default
}
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
YAML_FILE
=
"
$SCRIPT_DIR
/disagg_planner.yaml"
TEST_FILE
=
"
$SCRIPT_DIR
/../test_scaling_e2e.py"
FRONTEND_PORT
=
8000
LOCAL_PORT
=
8000
DEPLOYMENT_NAME
=
"vllm-disagg-planner"
SAVE_RESULTS
=
false
MODE
=
"throughput"
# Colors for output
RED
=
'\033[0;31m'
...
...
@@ -198,14 +202,14 @@ cleanup_deployment() {
}
run_test
()
{
log_info
"Running scaling test (graduated 8->18 req/s)..."
log_info
"Running scaling test (graduated 8->18 req/s
, mode=
$MODE
)..."
local
python_cmd
=
"python3"
if
!
command
-v
python3 &> /dev/null
;
then
python_cmd
=
"python"
fi
local
test_args
=
"--namespace
$NAMESPACE
"
local
test_args
=
"--namespace
$NAMESPACE
--mode
$MODE
"
if
[
"
$SAVE_RESULTS
"
=
true
]
;
then
test_args
=
"
$test_args
--save-results"
log_info
"Results will be saved to tests/planner/e2e_scaling_results"
...
...
@@ -227,17 +231,26 @@ main() {
NAMESPACE
=
"
$2
"
shift
2
;;
--mode
)
MODE
=
"
$2
"
if
[[
"
$MODE
"
!=
"throughput"
&&
"
$MODE
"
!=
"load"
]]
;
then
log_error
"Invalid mode:
$MODE
(must be 'throughput' or 'load')"
exit
1
fi
shift
2
;;
--save-results
)
SAVE_RESULTS
=
true
shift
;;
--help
)
echo
"Usage:
$0
[--namespace NS] [--save-results]"
echo
"Usage:
$0
[--namespace NS]
[--mode MODE]
[--save-results]"
echo
""
echo
"Run SLA planner scaling test (graduated 8->1
5->25
req/s prefill scaling)"
echo
"Run SLA planner scaling test (graduated 8->1
8
req/s prefill scaling)"
echo
""
echo
"Options:"
echo
" --namespace NS Kubernetes namespace (default: default)"
echo
" --mode MODE Scaling mode: 'throughput' (default) or 'load'"
echo
" --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo
" --help Show this help"
exit
0
...
...
@@ -250,8 +263,17 @@ main() {
esac
done
# Select YAML based on mode
if
[
"
$MODE
"
=
"load"
]
;
then
YAML_FILE
=
"
$SCRIPT_DIR
/disagg_planner_load.yaml"
else
YAML_FILE
=
"
$SCRIPT_DIR
/disagg_planner_throughput.yaml"
fi
log_info
"SLA Planner Scaling Test"
log_info
"Namespace:
$NAMESPACE
"
log_info
"Mode:
$MODE
"
log_info
"YAML:
$YAML_FILE
"
log_info
"Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites
...
...
tests/planner/test_replica_calculation.py
View file @
359765d3
...
...
@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch
import
pytest
from
dynamo.planner.utils.decode_planner
import
DecodePlanner
from
dynamo.planner.utils.planner_core
import
(
DecodePlanner
,
Metrics
,
PlannerSharedState
,
PrefillPlanner
,
_apply_global_gpu_budget
,
)
from
dynamo.planner.utils.prefill_planner
import
PrefillPlanner
from
dynamo.planner.utils.prometheus
import
Metrics
pytestmark
=
[
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
gpu_0
]
...
...
@@ -78,7 +78,7 @@ class PlannerHarness:
"isl_predictor"
,
"osl_predictor"
,
"connector"
,
"prometheus_
api
_client"
,
"prometheus_
traffic
_client"
,
"args"
,
}
prefill_attrs
=
{
...
...
@@ -111,7 +111,7 @@ class PlannerHarness:
"isl_predictor"
,
"osl_predictor"
,
"connector"
,
"prometheus_
api
_client"
,
"prometheus_
traffic
_client"
,
"args"
,
"get_workers_info"
,
}
...
...
@@ -194,7 +194,7 @@ def planner():
planner
.
connector
=
Mock
()
# Mock prometheus client
planner
.
prometheus_
api
_client
=
Mock
()
planner
.
prometheus_
traffic
_client
=
Mock
()
# Set up some baseline correction factors
planner
.
p_correction_factor
=
1.0
...
...
tests/planner/test_scaling_e2e.py
View file @
359765d3
...
...
@@ -261,10 +261,12 @@ class ScalingE2ETest:
namespace
:
str
=
"default"
,
base_url
:
str
=
"http://localhost:8000"
,
save_results
:
bool
=
False
,
mode
:
str
=
"throughput"
,
):
self
.
namespace
=
namespace
self
.
base_url
=
base_url
self
.
save_results
=
save_results
self
.
mode
=
mode
self
.
k8s_monitor
=
KubernetesMonitor
(
namespace
)
self
.
load_generator
=
LoadGenerator
(
...
...
@@ -281,7 +283,7 @@ class ScalingE2ETest:
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D
"""
logger
.
info
(
"Starting scaling integration test"
)
logger
.
info
(
f
"Starting scaling integration test
(mode=
{
self
.
mode
}
)
"
)
test_start_time
=
time
.
time
()
...
...
@@ -291,7 +293,11 @@ class ScalingE2ETest:
# Start background monitoring
# Calculate based on actual phases from load generator
# Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
if
self
.
mode
==
"load"
:
# Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer
total_test_duration
=
120
+
30
+
120
+
BUFFER_DURATION
else
:
# Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration
=
90
+
30
+
120
+
BUFFER_DURATION
monitoring_task
=
asyncio
.
create_task
(
self
.
k8s_monitor
.
monitor_scaling
(
...
...
@@ -305,8 +311,10 @@ class ScalingE2ETest:
try
:
# Use the load generator's built-in scaling test
logger
.
info
(
"Running scaling scenario (8 req/s -> 18 req/s)"
)
load_results
=
await
self
.
load_generator
.
run_scaling_test
()
logger
.
info
(
f
"Running scaling scenario (8 req/s -> 18 req/s, mode=
{
self
.
mode
}
)"
)
load_results
=
await
self
.
load_generator
.
run_scaling_test
(
mode
=
self
.
mode
)
# Extract load results for analysis (2-phase structure)
phase_results
=
load_results
.
get
(
"phase_results"
,
{})
...
...
@@ -475,12 +483,20 @@ async def main():
action
=
"store_true"
,
help
=
"Save results to tests/planner/e2e_scaling_results instead of /tmp"
,
)
# No additional arguments needed - test is hardcoded
parser
.
add_argument
(
"--mode"
,
choices
=
[
"throughput"
,
"load"
],
default
=
"throughput"
,
help
=
"Scaling mode to test: throughput (default) or load"
,
)
args
=
parser
.
parse_args
()
test
=
ScalingE2ETest
(
namespace
=
args
.
namespace
,
base_url
=
args
.
base_url
,
save_results
=
args
.
save_results
namespace
=
args
.
namespace
,
base_url
=
args
.
base_url
,
save_results
=
args
.
save_results
,
mode
=
args
.
mode
,
)
try
:
...
...
tests/planner/unit/test_load_based_scaling.py
0 → 100644
View file @
359765d3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
os
from
unittest.mock
import
Mock
,
patch
import
pytest
from
dynamo.planner.utils.decode_planner
import
DecodePlanner
from
dynamo.planner.utils.load_based_regression
import
LoadBasedRegressionModel
from
dynamo.planner.utils.planner_argparse
import
validate_sla_planner_args
from
dynamo.planner.utils.planner_core
import
PlannerSharedState
from
dynamo.planner.utils.prefill_planner
import
PrefillPlanner
from
dynamo.planner.utils.prometheus
import
CachedLoadMetrics
,
DirectRouterMetricsClient
pytestmark
=
[
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
unit
,
pytest
.
mark
.
planner
,
]
# ── LoadBasedRegressionModel tests ──────────────────────────────────────
class
TestLoadBasedRegressionModel
:
def
test_insufficient_data
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
50
,
min_observations
=
5
)
assert
not
model
.
has_sufficient_data
()
assert
model
.
predict_x_from_sla
(
100.0
)
is
None
def
test_basic_linear_prediction
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
50
,
min_observations
=
3
)
# y = 2x + 10: x in [1..5], y in [12..20]
for
x
in
range
(
1
,
6
):
model
.
add_observation
(
float
(
x
),
2.0
*
x
+
10.0
)
assert
model
.
has_sufficient_data
()
# Reverse: x = (y - 10) / 2, y=100 => x=45
result
=
model
.
predict_x_from_sla
(
100.0
)
assert
result
is
not
None
assert
abs
(
result
-
45.0
)
<
0.5
def
test_negative_slope_fallback_points_below_sla
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
50
,
min_observations
=
3
)
# Negative slope: higher x => lower y
# x=1 -> y=98, x=2 -> y=96, x=3 -> y=94, x=4 -> y=92, x=5 -> y=90
for
x
in
range
(
1
,
6
):
model
.
add_observation
(
float
(
x
),
100.0
-
2.0
*
x
)
# target_y=95 => points below: x=3(y=94), x=4(y=92), x=5(y=90)
# min x among those is 3
result
=
model
.
predict_x_from_sla
(
95.0
)
assert
result
is
not
None
assert
abs
(
result
-
3.0
)
<
0.01
def
test_negative_slope_fallback_all_above_sla
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
50
,
min_observations
=
3
)
# Negative slope: x=1 -> y=98, x=2 -> y=96, ..., x=5 -> y=90
for
x
in
range
(
1
,
6
):
model
.
add_observation
(
float
(
x
),
100.0
-
2.0
*
x
)
# target_y=50 => all points have y >= 90 > 50, none below
# fallback returns smallest x overall = 1
result
=
model
.
predict_x_from_sla
(
50.0
)
assert
result
is
not
None
assert
abs
(
result
-
1.0
)
<
0.01
def
test_sliding_window_evicts_old
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
5
,
min_observations
=
3
)
# Add 10 observations; only last 5 should remain
for
i
in
range
(
10
):
model
.
add_observation
(
float
(
i
),
float
(
i
)
*
2
)
assert
model
.
num_observations
==
5
def
test_result_clamped_to_non_negative
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
50
,
min_observations
=
3
)
# y = 10x + 100: intercept=100, slope=10
for
x
in
range
(
1
,
6
):
model
.
add_observation
(
float
(
x
),
10.0
*
x
+
100.0
)
# target_y=5 => x = (5-100)/10 = -9.5 => clamped to 0
result
=
model
.
predict_x_from_sla
(
5.0
)
assert
result
==
0.0
def
test_slope_and_intercept_properties
(
self
):
model
=
LoadBasedRegressionModel
(
window_size
=
50
,
min_observations
=
3
)
for
x
in
range
(
1
,
6
):
model
.
add_observation
(
float
(
x
),
3.0
*
x
+
5.0
)
assert
model
.
slope
is
not
None
assert
abs
(
model
.
slope
-
3.0
)
<
0.01
assert
model
.
intercept
is
not
None
assert
abs
(
model
.
intercept
-
5.0
)
<
0.01
# ── DirectRouterMetricsClient tests ─────────────────────────────────────
class
TestDirectRouterMetricsClient
:
def
test_parse_prometheus_text_basic
(
self
):
"""Metrics with dynamo_namespace/model labels are grouped by worker_type."""
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"test-ns"
)
text
=
(
"# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens
\n
"
"# TYPE dynamo_frontend_worker_active_prefill_tokens gauge
\n
"
'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 1234
\n
'
'dynamo_frontend_worker_active_decode_blocks{dynamo_namespace="test-ns",model="TestModel",worker_type="decode",worker_id="w2"} 56
\n
'
'dynamo_frontend_worker_last_time_to_first_token_seconds{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 0.25
\n
'
'dynamo_frontend_worker_last_input_sequence_tokens{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 3000
\n
'
'dynamo_frontend_worker_last_inter_token_latency_seconds{dynamo_namespace="test-ns",model="TestModel",worker_type="decode",worker_id="w2"} 0.04
\n
'
)
result
=
client
.
_parse_prometheus_text
(
text
)
assert
"prefill"
in
result
assert
"w1"
in
result
[
"prefill"
]
assert
result
[
"prefill"
][
"w1"
][
"active_prefill_tokens"
]
==
1234.0
assert
abs
(
result
[
"prefill"
][
"w1"
][
"last_ttft"
]
-
0.25
)
<
1e-6
assert
result
[
"prefill"
][
"w1"
][
"last_isl"
]
==
3000.0
assert
"decode"
in
result
assert
"w2"
in
result
[
"decode"
]
assert
result
[
"decode"
][
"w2"
][
"active_decode_blocks"
]
==
56.0
assert
abs
(
result
[
"decode"
][
"w2"
][
"last_itl"
]
-
0.04
)
<
1e-6
def
test_parse_ignores_extra_labels
(
self
):
"""Parser extracts metrics regardless of extra labels like dynamo_namespace/model."""
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"ns"
)
text
=
'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="any-ns",model="mymodel",worker_type="prefill",worker_id="w1"} 100
\n
'
result
=
client
.
_parse_prometheus_text
(
text
)
assert
"prefill"
in
result
assert
"w1"
in
result
[
"prefill"
]
assert
result
[
"prefill"
][
"w1"
][
"active_prefill_tokens"
]
==
100.0
def
test_get_recent_and_averaged_empty_buffer
(
self
):
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"ns"
)
assert
client
.
get_recent_and_averaged_metrics
(
"prefill"
)
is
None
def
test_get_recent_and_averaged_single_sample
(
self
):
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"ns"
)
client
.
_sample_buffer
=
[
{
"prefill"
:
{
"w1"
:
{
"active_prefill_tokens"
:
100.0
}},
"decode"
:
{
"w2"
:
{
"active_decode_blocks"
:
50.0
}},
}
]
result
=
client
.
get_recent_and_averaged_metrics
(
"prefill"
)
assert
result
is
not
None
recent
,
per_worker_avg
,
cluster_avg
=
result
assert
recent
[
"w1"
][
"active_prefill_tokens"
]
==
100.0
assert
per_worker_avg
[
"w1"
][
"active_prefill_tokens"
]
==
100.0
assert
cluster_avg
[
"active_prefill_tokens"
]
==
100.0
# decode workers not included
assert
"w2"
not
in
recent
result_d
=
client
.
get_recent_and_averaged_metrics
(
"decode"
)
assert
result_d
is
not
None
recent_d
,
per_worker_avg_d
,
cluster_avg_d
=
result_d
assert
recent_d
[
"w2"
][
"active_decode_blocks"
]
==
50.0
assert
per_worker_avg_d
[
"w2"
][
"active_decode_blocks"
]
==
50.0
assert
cluster_avg_d
[
"active_decode_blocks"
]
==
50.0
def
test_get_recent_and_averaged_multiple_samples
(
self
):
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"ns"
)
client
.
_sample_buffer
=
[
{
"prefill"
:
{
"w1"
:
{
"active_prefill_tokens"
:
100.0
}}},
{
"prefill"
:
{
"w1"
:
{
"active_prefill_tokens"
:
200.0
}}},
{
"prefill"
:
{
"w1"
:
{
"active_prefill_tokens"
:
300.0
}}},
]
result
=
client
.
get_recent_and_averaged_metrics
(
"prefill"
)
assert
result
is
not
None
recent
,
per_worker_avg
,
cluster_avg
=
result
# Recent should be the last sample
assert
abs
(
recent
[
"w1"
][
"active_prefill_tokens"
]
-
300.0
)
<
1e-6
# Per-worker averaged over time
assert
abs
(
per_worker_avg
[
"w1"
][
"active_prefill_tokens"
]
-
200.0
)
<
1e-6
# Cluster averaged (same as per-worker when only 1 worker)
assert
abs
(
cluster_avg
[
"active_prefill_tokens"
]
-
200.0
)
<
1e-6
def
test_parse_multiple_workers
(
self
):
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"ns"
)
text
=
(
'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="ns",model="M",worker_type="prefill",worker_id="w1"} 100
\n
'
'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="ns",model="M",worker_type="prefill",worker_id="w2"} 200
\n
'
)
result
=
client
.
_parse_prometheus_text
(
text
)
assert
len
(
result
.
get
(
"prefill"
,
{}))
==
2
assert
result
[
"prefill"
][
"w1"
][
"active_prefill_tokens"
]
==
100.0
assert
result
[
"prefill"
][
"w2"
][
"active_prefill_tokens"
]
==
200.0
def
test_parse_rust_labels_separates_worker_types
(
self
):
"""Rust KV router emits all metrics for all workers; parser must separate by worker_type."""
client
=
DirectRouterMetricsClient
(
"http://localhost:8000/metrics"
,
"ns"
)
text
=
(
"# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens
\n
"
"# TYPE dynamo_frontend_worker_active_prefill_tokens gauge
\n
"
'dynamo_frontend_worker_active_prefill_tokens{worker_id="123",dp_rank="0",worker_type="prefill"} 500
\n
'
'dynamo_frontend_worker_active_prefill_tokens{worker_id="456",dp_rank="0",worker_type="decode"} 0
\n
'
'dynamo_frontend_worker_active_decode_blocks{worker_id="123",dp_rank="0",worker_type="prefill"} 0
\n
'
'dynamo_frontend_worker_active_decode_blocks{worker_id="456",dp_rank="0",worker_type="decode"} 30
\n
'
'dynamo_frontend_worker_last_time_to_first_token_seconds{worker_id="123",dp_rank="0",worker_type="prefill"} 0.15
\n
'
'dynamo_frontend_worker_last_input_sequence_tokens{worker_id="123",dp_rank="0",worker_type="prefill"} 2000
\n
'
'dynamo_frontend_worker_last_inter_token_latency_seconds{worker_id="456",dp_rank="0",worker_type="decode"} 0.03
\n
'
)
result
=
client
.
_parse_prometheus_text
(
text
)
# Prefill worker 123 grouped under "prefill"
assert
"prefill"
in
result
assert
"123"
in
result
[
"prefill"
]
assert
result
[
"prefill"
][
"123"
][
"active_prefill_tokens"
]
==
500.0
assert
result
[
"prefill"
][
"123"
][
"last_ttft"
]
==
0.15
assert
result
[
"prefill"
][
"123"
][
"last_isl"
]
==
2000.0
# Decode worker 456 grouped under "decode"
assert
"decode"
in
result
assert
"456"
in
result
[
"decode"
]
assert
result
[
"decode"
][
"456"
][
"active_decode_blocks"
]
==
30.0
assert
abs
(
result
[
"decode"
][
"456"
][
"last_itl"
]
-
0.03
)
<
1e-6
# Cross-type metrics are stored under the correct worker_type
# (prefill worker's decode_blocks=0 stored under "prefill", not "decode")
assert
"456"
not
in
result
[
"prefill"
]
assert
"123"
not
in
result
[
"decode"
]
# ── PrefillPlanner load-based scaling tests ─────────────────────────────
@
pytest
.
fixture
(
autouse
=
True
)
def
mock_prometheus_metrics
():
with
patch
(
"dynamo.planner.utils.planner_core.Gauge"
)
as
mock_gauge
:
mock_gauge
.
return_value
=
Mock
()
yield
def
_build_loadbased_args
():
args
=
argparse
.
Namespace
()
args
.
adjustment_interval
=
60
args
.
prefill_engine_num_gpu
=
1
args
.
decode_engine_num_gpu
=
1
args
.
min_endpoint
=
1
args
.
max_gpu_budget
=
-
1
args
.
ttft
=
500.0
args
.
itl
=
50.0
args
.
backend
=
"vllm"
args
.
no_operation
=
True
args
.
no_correction
=
True
args
.
metric_pulling_prometheus_endpoint
=
"http://localhost:9090"
args
.
metric_reporting_prometheus_port
=
0
args
.
load_predictor
=
"constant"
args
.
load_predictor_warmup_trace
=
None
args
.
profile_results_dir
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
,
"profiling_results"
,
"H200_TP1P_TP1D"
,
)
args
.
environment
=
"kubernetes"
args
.
namespace
=
"test-namespace"
args
.
mode
=
"disagg"
# Load-based scaling config
args
.
enable_loadbased_scaling
=
True
args
.
enable_throughput_scaling
=
True
args
.
disable_throughput_scaling
=
False
args
.
loadbased_router_metrics_url
=
"http://router:8000/metrics"
args
.
loadbased_adjustment_interval
=
5
args
.
loadbased_learning_window
=
50
args
.
loadbased_scaling_down_sensitivity
=
80
args
.
loadbased_metric_samples
=
10
args
.
loadbased_min_observations
=
5
return
args
def
_avg
(
per_worker
:
dict
[
str
,
dict
[
str
,
float
]])
->
dict
[
str
,
float
]:
"""Compute flat averaged metrics from per-worker dicts (for test convenience)."""
sums
:
dict
[
str
,
float
]
=
{}
counts
:
dict
[
str
,
int
]
=
{}
for
metrics
in
per_worker
.
values
():
for
k
,
v
in
metrics
.
items
():
sums
[
k
]
=
sums
.
get
(
k
,
0.0
)
+
v
counts
[
k
]
=
counts
.
get
(
k
,
0
)
+
1
return
{
k
:
sums
[
k
]
/
counts
[
k
]
for
k
in
sums
}
class
TestPrefillLoadBasedScaling
:
def
test_scale_up_all_workers_above_target
(
self
):
"""When all workers have active_prefill_tokens above the regression target, scale up."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
2
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# Feed regression data: TTFT = 0.1 * (active_prefill_tokens + ISL) + 100
# With TTFT SLA = 500ms: x_sla = (500 - 100) / 0.1 = 4000
# If ISL avg = 3000, target_active_tokens = 4000 - 3000 = 1000
for
i
in
range
(
10
):
x
=
2000
+
i
*
200
# active_tokens + ISL
y
=
0.1
*
x
+
100
# TTFT in ms
planner
.
ttft_regression
.
add_observation
(
x
,
y
)
# Set per-worker metrics: all workers ABOVE target (1000)
metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
1500.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.35
,
},
"w2"
:
{
"active_prefill_tokens"
:
1200.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.30
,
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
==
3
# scale up from 2 to 3
def
test_scale_down_all_workers_below_boundary
(
self
):
"""When all workers are below the scale-down boundary, scale down."""
args
=
_build_loadbased_args
()
args
.
loadbased_scaling_down_sensitivity
=
100
# max sensitivity
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
3
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# Feed regression: TTFT = 0.1 * x + 100
# x_sla = (500-100)/0.1 = 4000, target = 4000-3000 = 1000
# boundary = 1000 * (3-1)/3 * 1.0 = 666.67
for
i
in
range
(
10
):
x
=
2000
+
i
*
200
y
=
0.1
*
x
+
100
planner
.
ttft_regression
.
add_observation
(
x
,
y
)
# All workers below boundary (666.67)
metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
100.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.15
,
},
"w2"
:
{
"active_prefill_tokens"
:
200.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.16
,
},
"w3"
:
{
"active_prefill_tokens"
:
150.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.15
,
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
==
2
# scale down from 3 to 2
def
test_no_change_mixed_workers
(
self
):
"""When workers are mixed (some above, some below), no scaling."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
2
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
for
i
in
range
(
10
):
x
=
2000
+
i
*
200
y
=
0.1
*
x
+
100
planner
.
ttft_regression
.
add_observation
(
x
,
y
)
# Mixed: one above target, one below
metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
1500.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.35
,
},
"w2"
:
{
"active_prefill_tokens"
:
100.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.15
,
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
is
None
def
test_cold_start_returns_none
(
self
):
"""With insufficient data, loadbased_plan_adjustment returns None."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
2
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# Only 2 observations (min is 5)
planner
.
ttft_regression
.
add_observation
(
1000.0
,
200.0
)
planner
.
ttft_regression
.
add_observation
(
2000.0
,
300.0
)
metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
5000.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.5
,
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
is
None
class
TestDecodeLoadBasedScaling
:
def
test_scale_up_all_workers_above_target
(
self
):
"""When all workers have active_decode_blocks above x_sla, scale up."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_d_workers
=
2
planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# Feed regression: ITL = 0.5 * active_decode_blocks + 10
# x_sla = (50 - 10) / 0.5 = 80
for
i
in
range
(
10
):
x
=
20
+
i
*
10
y
=
0.5
*
x
+
10
planner
.
itl_regression
.
add_observation
(
x
,
y
)
# All workers above x_sla (80)
metrics
=
{
"w1"
:
{
"active_decode_blocks"
:
100.0
,
"last_itl"
:
0.06
},
"w2"
:
{
"active_decode_blocks"
:
95.0
,
"last_itl"
:
0.055
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
==
3
def
test_scale_down_all_workers_below_boundary
(
self
):
"""When all decode workers are below boundary, scale down."""
args
=
_build_loadbased_args
()
args
.
loadbased_scaling_down_sensitivity
=
100
shared_state
=
PlannerSharedState
()
shared_state
.
num_d_workers
=
3
planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# ITL = 0.5 * x + 10, x_sla = (50-10)/0.5 = 80
# boundary = 80 * (3-1)/3 * 1.0 = 53.33
for
i
in
range
(
10
):
x
=
20
+
i
*
10
y
=
0.5
*
x
+
10
planner
.
itl_regression
.
add_observation
(
x
,
y
)
# All workers below boundary (53.33)
metrics
=
{
"w1"
:
{
"active_decode_blocks"
:
10.0
,
"last_itl"
:
0.02
},
"w2"
:
{
"active_decode_blocks"
:
15.0
,
"last_itl"
:
0.025
},
"w3"
:
{
"active_decode_blocks"
:
20.0
,
"last_itl"
:
0.03
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
==
2
def
test_cold_start_returns_none
(
self
):
"""Decode cold start also returns None."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_d_workers
=
2
planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
planner
.
itl_regression
.
add_observation
(
10.0
,
15.0
)
metrics
=
{
"w1"
:
{
"active_decode_blocks"
:
200.0
,
"last_itl"
:
0.1
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
is
None
class
TestLowerBoundEnforcement
:
def
test_throughput_lower_bound_respected
(
self
):
"""Load-based scaling should never go below throughput lower bound."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
5
# Throughput says we need at least 4 prefill workers
shared_state
.
throughput_lower_bound_p
=
4
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# Regression says we should scale down to 4 (from 5)
for
i
in
range
(
10
):
x
=
2000
+
i
*
200
y
=
0.1
*
x
+
100
planner
.
ttft_regression
.
add_observation
(
x
,
y
)
# Workers all lightly loaded => wants to scale down to 4
metrics
=
{
f
"w
{
i
}
"
:
{
"active_prefill_tokens"
:
50.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.12
,
}
for
i
in
range
(
5
)
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
result
=
planner
.
loadbased_plan_adjustment
()
# Even though load-based wants to scale down, the result should be
# at least 4 after lower bound enforcement (done in the loop, not in
# loadbased_plan_adjustment itself)
# loadbased_plan_adjustment returns raw desired value
assert
result
==
4
# raw value from load-based
def
test_scaling_down_sensitivity_zero_never_scales_down
(
self
):
"""With sensitivity=0, scale-down boundary is 0 so never scale down."""
args
=
_build_loadbased_args
()
args
.
loadbased_scaling_down_sensitivity
=
0
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
3
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
for
i
in
range
(
10
):
x
=
2000
+
i
*
200
y
=
0.1
*
x
+
100
planner
.
ttft_regression
.
add_observation
(
x
,
y
)
# All workers at zero load
metrics
=
{
f
"w
{
i
}
"
:
{
"active_prefill_tokens"
:
0.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.12
,
}
for
i
in
range
(
3
)
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
# boundary = target * (3-1)/3 * 0/100 = 0
# all workers at 0 which is NOT less than 0 (it's equal)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
is
None
# no scaling happens
# ── Correction factor auto-disable tests ─────────────────────────────
class
TestCorrectionFactorAutoDisable
:
def
test_correction_factor_disabled_when_loadbased_enabled
(
self
):
"""Correction factor should be auto-disabled when load-based scaling is on."""
args
=
_build_loadbased_args
()
args
.
no_correction
=
False
# user didn't explicitly disable
validate_sla_planner_args
(
args
)
assert
args
.
no_correction
is
True
def
test_correction_factor_stays_disabled_if_already_set
(
self
):
"""If user already set --no-correction, no extra warning needed."""
args
=
_build_loadbased_args
()
args
.
no_correction
=
True
# user explicitly set
validate_sla_planner_args
(
args
)
assert
args
.
no_correction
is
True
def
test_correction_factor_not_disabled_without_loadbased
(
self
):
"""Without load-based scaling, correction factor should respect user setting."""
args
=
_build_loadbased_args
()
args
.
enable_loadbased_scaling
=
False
args
.
no_correction
=
False
validate_sla_planner_args
(
args
)
assert
args
.
no_correction
is
False
# ── DGD worker count reconciliation tests ────────────────────────────
class
TestWorkerCountReconciliation
:
async
def
test_prefill_observe_gets_only_prefill_workers
(
self
):
"""observe_engine_load_stats for prefill queries get_recent_and_averaged_metrics('prefill')."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
1
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# get_recent_and_averaged_metrics("prefill") returns (recent, per_worker_avg, cluster_avg)
prefill_metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
500.0
,
"last_ttft"
:
0.2
,
"last_isl"
:
3000.0
,
},
}
planner
.
prometheus_engine_client
=
Mock
()
planner
.
prometheus_engine_client
.
get_recent_and_averaged_metrics
.
return_value
=
(
prefill_metrics
,
prefill_metrics
,
_avg
(
prefill_metrics
),
)
await
planner
.
observe_engine_load_stats
()
planner
.
prometheus_engine_client
.
get_recent_and_averaged_metrics
.
assert_called_once_with
(
"prefill"
)
assert
len
(
planner
.
cached_load_metrics
.
recent
)
==
1
assert
"w1"
in
planner
.
cached_load_metrics
.
recent
async
def
test_decode_observe_gets_only_decode_workers
(
self
):
"""observe_engine_load_stats for decode queries get_recent_and_averaged_metrics('decode')."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_d_workers
=
1
planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
decode_metrics
=
{
"w2"
:
{
"active_decode_blocks"
:
50.0
,
"last_itl"
:
0.04
},
}
planner
.
prometheus_engine_client
=
Mock
()
planner
.
prometheus_engine_client
.
get_recent_and_averaged_metrics
.
return_value
=
(
decode_metrics
,
decode_metrics
,
_avg
(
decode_metrics
),
)
await
planner
.
observe_engine_load_stats
()
planner
.
prometheus_engine_client
.
get_recent_and_averaged_metrics
.
assert_called_once_with
(
"decode"
)
assert
len
(
planner
.
cached_load_metrics
.
recent
)
==
1
assert
"w2"
in
planner
.
cached_load_metrics
.
recent
def
test_worker_count_mismatch_detected
(
self
):
"""When DGD and Prometheus worker counts differ, the mismatch should be detectable."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
# DGD says 3 prefill workers
shared_state
.
num_p_workers
=
3
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
# But router only reports 2 prefill workers
metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
500.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.2
,
},
"w2"
:
{
"active_prefill_tokens"
:
600.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.25
,
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
# The mismatch should be detectable by comparing counts
prom_count
=
len
(
planner
.
cached_load_metrics
.
recent
)
dgd_count
=
shared_state
.
num_p_workers
assert
prom_count
!=
dgd_count
assert
prom_count
==
2
assert
dgd_count
==
3
def
test_worker_count_match_allows_scaling
(
self
):
"""When DGD and Prometheus counts match, scaling proceeds normally."""
args
=
_build_loadbased_args
()
shared_state
=
PlannerSharedState
()
shared_state
.
num_p_workers
=
2
planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
planner
.
model_name
=
"test-model"
metrics
=
{
"w1"
:
{
"active_prefill_tokens"
:
1500.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.35
,
},
"w2"
:
{
"active_prefill_tokens"
:
1200.0
,
"last_isl"
:
3000.0
,
"last_ttft"
:
0.30
,
},
}
planner
.
cached_load_metrics
=
CachedLoadMetrics
(
recent
=
metrics
,
per_worker_averaged
=
metrics
,
cluster_averaged
=
_avg
(
metrics
)
)
prom_count
=
len
(
planner
.
cached_load_metrics
.
recent
)
dgd_count
=
shared_state
.
num_p_workers
assert
prom_count
==
dgd_count
# With matching counts and sufficient regression data, scaling should work
for
i
in
range
(
10
):
x
=
2000
+
i
*
200
y
=
0.1
*
x
+
100
planner
.
ttft_regression
.
add_observation
(
x
,
y
)
result
=
planner
.
loadbased_plan_adjustment
()
assert
result
is
not
None
# scaling proceeds
tests/planner/unit/test_sla_planner_scaling.py
View file @
359765d3
...
...
@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch
import
pytest
from
dynamo.planner.utils.decode_planner
import
DecodePlanner
from
dynamo.planner.utils.exceptions
import
DeploymentValidationError
from
dynamo.planner.utils.planner_core
import
(
DecodePlanner
,
PlannerSharedState
,
PrefillPlanner
,
_initialize_gpu_counts
,
)
from
dynamo.planner.utils.planner_core
import
PlannerSharedState
,
_initialize_gpu_counts
from
dynamo.planner.utils.prefill_planner
import
PrefillPlanner
pytestmark
=
[
pytest
.
mark
.
gpu_0
,
...
...
@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
shared_state
=
PlannerSharedState
()
prefill_planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
decode_planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
prefill_planner
.
prometheus_
api
_client
=
prometheus_client
decode_planner
.
prometheus_
api
_client
=
prometheus_client
prefill_planner
.
prometheus_
traffic
_client
=
prometheus_client
decode_planner
.
prometheus_
traffic
_client
=
prometheus_client
prefill_planner
.
model_name
=
"test-model"
decode_planner
.
model_name
=
"test-model"
...
...
@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):
def
_run_interval
(
prefill_planner
,
decode_planner
,
shared_state
):
asyncio
.
run
(
prefill_planner
.
observe_
metric
s
(
require_prefill
=
True
,
require_decode
=
True
)
prefill_planner
.
observe_
traffic_stat
s
(
require_prefill
=
True
,
require_decode
=
True
)
)
decode_planner
.
update_predictors_from_metrics
(
shared_state
.
last_metrics
)
next_num_p
=
prefill_planner
.
plan_adjustment
()
...
...
tests/planner/utils/load_generator.py
View file @
359765d3
...
...
@@ -230,7 +230,7 @@ class LoadGenerator:
logger
.
warning
(
f
"Failed to parse aiperf results:
{
e
}
"
)
return
{}
async
def
run_scaling_test
(
self
)
->
Dict
[
str
,
Any
]:
async
def
run_scaling_test
(
self
,
mode
:
str
=
"throughput"
)
->
Dict
[
str
,
Any
]:
"""
Run a graduated scaling test for prefill scaling.
...
...
@@ -238,17 +238,23 @@ class LoadGenerator:
- Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
Args:
mode: Scaling mode - "throughput" or "load".
"load" uses a longer baseline for regression warmup.
Returns:
Dictionary with complete test results
"""
logger
.
info
(
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D)"
f
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D
, mode=
{
mode
}
)"
)
logger
.
info
(
"Using conservative graduated approach with metric generation"
)
# Graduated test parameters (optimized for prefill scaling)
# Load-based scaling needs longer baseline for regression warmup
baseline_duration
=
120
if
mode
==
"load"
else
90
phases
:
List
[
Dict
[
str
,
Any
]]
=
[
{
"rate"
:
8.0
,
"duration"
:
90
,
"name"
:
"baseline"
},
{
"rate"
:
8.0
,
"duration"
:
baseline_duration
,
"name"
:
"baseline"
},
{
"rate"
:
18.0
,
"duration"
:
120
,
"name"
:
"prefill_scaling_trigger"
},
]
transition_delay
=
30
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment