Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
359765d3
Unverified
Commit
359765d3
authored
Feb 14, 2026
by
Hongkuan Zhou
Committed by
GitHub
Feb 14, 2026
Browse files
feat: load-based scaling in SLA Planner (#6145)
Signed-off-by:
hongkuanz
<
hongkuanz@nvidia.com
>
parent
815b1291
Changes
27
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
879 additions
and
31 deletions
+879
-31
tests/planner/scaling/disagg_planner_throughput.yaml
tests/planner/scaling/disagg_planner_throughput.yaml
+71
-0
tests/planner/scaling/run_scaling_test.sh
tests/planner/scaling/run_scaling_test.sh
+28
-6
tests/planner/test_replica_calculation.py
tests/planner/test_replica_calculation.py
+6
-6
tests/planner/test_scaling_e2e.py
tests/planner/test_scaling_e2e.py
+23
-7
tests/planner/unit/test_load_based_scaling.py
tests/planner/unit/test_load_based_scaling.py
+736
-0
tests/planner/unit/test_sla_planner_scaling.py
tests/planner/unit/test_sla_planner_scaling.py
+6
-9
tests/planner/utils/load_generator.py
tests/planner/utils/load_generator.py
+9
-3
No files found.
tests/planner/scaling/disagg_planner_throughput.yaml
0 → 100644
View file @
359765d3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
vllm-disagg-planner
spec
:
services
:
Frontend
:
componentType
:
frontend
replicas
:
1
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
Planner
:
componentType
:
planner
replicas
:
1
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/src/dynamo/planner
command
:
-
python3
-
-m
-
planner_sla
args
:
-
--environment=kubernetes
-
--backend=vllm
-
--adjustment-interval=60
-
--profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
-
--no-correction
VllmDecodeWorker
:
envFromSecret
:
hf-token-secret
componentType
:
worker
subComponentType
:
decode
replicas
:
1
resources
:
limits
:
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
command
:
-
python3
args
:
-
-m
-
dynamo.vllm
-
--model
-
nvidia/Llama-3.1-8B-Instruct-FP8
VllmPrefillWorker
:
envFromSecret
:
hf-token-secret
componentType
:
worker
subComponentType
:
prefill
replicas
:
1
resources
:
limits
:
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
command
:
-
python3
args
:
-
-m
-
dynamo.vllm
-
--model
-
nvidia/Llama-3.1-8B-Instruct-FP8
-
--is-prefill-worker
tests/planner/scaling/run_scaling_test.sh
View file @
359765d3
...
@@ -7,20 +7,24 @@
...
@@ -7,20 +7,24 @@
# 1. Deploys the disaggregated planner if not already running
# 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000
# 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready
# 3. Waits for the deployment to be ready
# 4. Runs the
hardcoded
scaling test (
12
req/s ->
24
req/s)
# 4. Runs the scaling test (
8
req/s ->
18
req/s)
# 5. Cleans up
# 5. Cleans up
#
# Supports two modes:
# --mode throughput (default) Uses throughput-based planner
# --mode load Uses load-based planner with regression scaling
set
-e
set
-e
# Configuration
# Configuration
NAMESPACE
=
${
NAMESPACE
:-
default
}
NAMESPACE
=
${
NAMESPACE
:-
default
}
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
YAML_FILE
=
"
$SCRIPT_DIR
/disagg_planner.yaml"
TEST_FILE
=
"
$SCRIPT_DIR
/../test_scaling_e2e.py"
TEST_FILE
=
"
$SCRIPT_DIR
/../test_scaling_e2e.py"
FRONTEND_PORT
=
8000
FRONTEND_PORT
=
8000
LOCAL_PORT
=
8000
LOCAL_PORT
=
8000
DEPLOYMENT_NAME
=
"vllm-disagg-planner"
DEPLOYMENT_NAME
=
"vllm-disagg-planner"
SAVE_RESULTS
=
false
SAVE_RESULTS
=
false
MODE
=
"throughput"
# Colors for output
# Colors for output
RED
=
'\033[0;31m'
RED
=
'\033[0;31m'
...
@@ -198,14 +202,14 @@ cleanup_deployment() {
...
@@ -198,14 +202,14 @@ cleanup_deployment() {
}
}
run_test
()
{
run_test
()
{
log_info
"Running scaling test (graduated 8->18 req/s)..."
log_info
"Running scaling test (graduated 8->18 req/s
, mode=
$MODE
)..."
local
python_cmd
=
"python3"
local
python_cmd
=
"python3"
if
!
command
-v
python3 &> /dev/null
;
then
if
!
command
-v
python3 &> /dev/null
;
then
python_cmd
=
"python"
python_cmd
=
"python"
fi
fi
local
test_args
=
"--namespace
$NAMESPACE
"
local
test_args
=
"--namespace
$NAMESPACE
--mode
$MODE
"
if
[
"
$SAVE_RESULTS
"
=
true
]
;
then
if
[
"
$SAVE_RESULTS
"
=
true
]
;
then
test_args
=
"
$test_args
--save-results"
test_args
=
"
$test_args
--save-results"
log_info
"Results will be saved to tests/planner/e2e_scaling_results"
log_info
"Results will be saved to tests/planner/e2e_scaling_results"
...
@@ -227,17 +231,26 @@ main() {
...
@@ -227,17 +231,26 @@ main() {
NAMESPACE
=
"
$2
"
NAMESPACE
=
"
$2
"
shift
2
shift
2
;;
;;
--mode
)
MODE
=
"
$2
"
if
[[
"
$MODE
"
!=
"throughput"
&&
"
$MODE
"
!=
"load"
]]
;
then
log_error
"Invalid mode:
$MODE
(must be 'throughput' or 'load')"
exit
1
fi
shift
2
;;
--save-results
)
--save-results
)
SAVE_RESULTS
=
true
SAVE_RESULTS
=
true
shift
shift
;;
;;
--help
)
--help
)
echo
"Usage:
$0
[--namespace NS] [--save-results]"
echo
"Usage:
$0
[--namespace NS]
[--mode MODE]
[--save-results]"
echo
""
echo
""
echo
"Run SLA planner scaling test (graduated 8->1
5->25
req/s prefill scaling)"
echo
"Run SLA planner scaling test (graduated 8->1
8
req/s prefill scaling)"
echo
""
echo
""
echo
"Options:"
echo
"Options:"
echo
" --namespace NS Kubernetes namespace (default: default)"
echo
" --namespace NS Kubernetes namespace (default: default)"
echo
" --mode MODE Scaling mode: 'throughput' (default) or 'load'"
echo
" --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo
" --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo
" --help Show this help"
echo
" --help Show this help"
exit
0
exit
0
...
@@ -250,8 +263,17 @@ main() {
...
@@ -250,8 +263,17 @@ main() {
esac
esac
done
done
# Select YAML based on mode
if
[
"
$MODE
"
=
"load"
]
;
then
YAML_FILE
=
"
$SCRIPT_DIR
/disagg_planner_load.yaml"
else
YAML_FILE
=
"
$SCRIPT_DIR
/disagg_planner_throughput.yaml"
fi
log_info
"SLA Planner Scaling Test"
log_info
"SLA Planner Scaling Test"
log_info
"Namespace:
$NAMESPACE
"
log_info
"Namespace:
$NAMESPACE
"
log_info
"Mode:
$MODE
"
log_info
"YAML:
$YAML_FILE
"
log_info
"Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
log_info
"Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites
check_prerequisites
...
...
tests/planner/test_replica_calculation.py
View file @
359765d3
...
@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch
...
@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch
import
pytest
import
pytest
from
dynamo.planner.utils.decode_planner
import
DecodePlanner
from
dynamo.planner.utils.planner_core
import
(
from
dynamo.planner.utils.planner_core
import
(
DecodePlanner
,
Metrics
,
PlannerSharedState
,
PlannerSharedState
,
PrefillPlanner
,
_apply_global_gpu_budget
,
_apply_global_gpu_budget
,
)
)
from
dynamo.planner.utils.prefill_planner
import
PrefillPlanner
from
dynamo.planner.utils.prometheus
import
Metrics
pytestmark
=
[
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
gpu_0
]
pytestmark
=
[
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
gpu_0
]
...
@@ -78,7 +78,7 @@ class PlannerHarness:
...
@@ -78,7 +78,7 @@ class PlannerHarness:
"isl_predictor"
,
"isl_predictor"
,
"osl_predictor"
,
"osl_predictor"
,
"connector"
,
"connector"
,
"prometheus_
api
_client"
,
"prometheus_
traffic
_client"
,
"args"
,
"args"
,
}
}
prefill_attrs
=
{
prefill_attrs
=
{
...
@@ -111,7 +111,7 @@ class PlannerHarness:
...
@@ -111,7 +111,7 @@ class PlannerHarness:
"isl_predictor"
,
"isl_predictor"
,
"osl_predictor"
,
"osl_predictor"
,
"connector"
,
"connector"
,
"prometheus_
api
_client"
,
"prometheus_
traffic
_client"
,
"args"
,
"args"
,
"get_workers_info"
,
"get_workers_info"
,
}
}
...
@@ -194,7 +194,7 @@ def planner():
...
@@ -194,7 +194,7 @@ def planner():
planner
.
connector
=
Mock
()
planner
.
connector
=
Mock
()
# Mock prometheus client
# Mock prometheus client
planner
.
prometheus_
api
_client
=
Mock
()
planner
.
prometheus_
traffic
_client
=
Mock
()
# Set up some baseline correction factors
# Set up some baseline correction factors
planner
.
p_correction_factor
=
1.0
planner
.
p_correction_factor
=
1.0
...
...
tests/planner/test_scaling_e2e.py
View file @
359765d3
...
@@ -261,10 +261,12 @@ class ScalingE2ETest:
...
@@ -261,10 +261,12 @@ class ScalingE2ETest:
namespace
:
str
=
"default"
,
namespace
:
str
=
"default"
,
base_url
:
str
=
"http://localhost:8000"
,
base_url
:
str
=
"http://localhost:8000"
,
save_results
:
bool
=
False
,
save_results
:
bool
=
False
,
mode
:
str
=
"throughput"
,
):
):
self
.
namespace
=
namespace
self
.
namespace
=
namespace
self
.
base_url
=
base_url
self
.
base_url
=
base_url
self
.
save_results
=
save_results
self
.
save_results
=
save_results
self
.
mode
=
mode
self
.
k8s_monitor
=
KubernetesMonitor
(
namespace
)
self
.
k8s_monitor
=
KubernetesMonitor
(
namespace
)
self
.
load_generator
=
LoadGenerator
(
self
.
load_generator
=
LoadGenerator
(
...
@@ -281,7 +283,7 @@ class ScalingE2ETest:
...
@@ -281,7 +283,7 @@ class ScalingE2ETest:
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D
- Phase 2 (18 req/s): Should scale to 2P1D
"""
"""
logger
.
info
(
"Starting scaling integration test"
)
logger
.
info
(
f
"Starting scaling integration test
(mode=
{
self
.
mode
}
)
"
)
test_start_time
=
time
.
time
()
test_start_time
=
time
.
time
()
...
@@ -291,8 +293,12 @@ class ScalingE2ETest:
...
@@ -291,8 +293,12 @@ class ScalingE2ETest:
# Start background monitoring
# Start background monitoring
# Calculate based on actual phases from load generator
# Calculate based on actual phases from load generator
# Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
if
self
.
mode
==
"load"
:
total_test_duration
=
90
+
30
+
120
+
BUFFER_DURATION
# Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer
total_test_duration
=
120
+
30
+
120
+
BUFFER_DURATION
else
:
# Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration
=
90
+
30
+
120
+
BUFFER_DURATION
monitoring_task
=
asyncio
.
create_task
(
monitoring_task
=
asyncio
.
create_task
(
self
.
k8s_monitor
.
monitor_scaling
(
self
.
k8s_monitor
.
monitor_scaling
(
total_test_duration
,
interval
=
MONITORING_INTERVAL
total_test_duration
,
interval
=
MONITORING_INTERVAL
...
@@ -305,8 +311,10 @@ class ScalingE2ETest:
...
@@ -305,8 +311,10 @@ class ScalingE2ETest:
try
:
try
:
# Use the load generator's built-in scaling test
# Use the load generator's built-in scaling test
logger
.
info
(
"Running scaling scenario (8 req/s -> 18 req/s)"
)
logger
.
info
(
load_results
=
await
self
.
load_generator
.
run_scaling_test
()
f
"Running scaling scenario (8 req/s -> 18 req/s, mode=
{
self
.
mode
}
)"
)
load_results
=
await
self
.
load_generator
.
run_scaling_test
(
mode
=
self
.
mode
)
# Extract load results for analysis (2-phase structure)
# Extract load results for analysis (2-phase structure)
phase_results
=
load_results
.
get
(
"phase_results"
,
{})
phase_results
=
load_results
.
get
(
"phase_results"
,
{})
...
@@ -475,12 +483,20 @@ async def main():
...
@@ -475,12 +483,20 @@ async def main():
action
=
"store_true"
,
action
=
"store_true"
,
help
=
"Save results to tests/planner/e2e_scaling_results instead of /tmp"
,
help
=
"Save results to tests/planner/e2e_scaling_results instead of /tmp"
,
)
)
# No additional arguments needed - test is hardcoded
parser
.
add_argument
(
"--mode"
,
choices
=
[
"throughput"
,
"load"
],
default
=
"throughput"
,
help
=
"Scaling mode to test: throughput (default) or load"
,
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
test
=
ScalingE2ETest
(
test
=
ScalingE2ETest
(
namespace
=
args
.
namespace
,
base_url
=
args
.
base_url
,
save_results
=
args
.
save_results
namespace
=
args
.
namespace
,
base_url
=
args
.
base_url
,
save_results
=
args
.
save_results
,
mode
=
args
.
mode
,
)
)
try
:
try
:
...
...
tests/planner/unit/test_load_based_scaling.py
0 → 100644
View file @
359765d3
This diff is collapsed.
Click to expand it.
tests/planner/unit/test_sla_planner_scaling.py
View file @
359765d3
...
@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch
...
@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch
import
pytest
import
pytest
from
dynamo.planner.utils.decode_planner
import
DecodePlanner
from
dynamo.planner.utils.exceptions
import
DeploymentValidationError
from
dynamo.planner.utils.exceptions
import
DeploymentValidationError
from
dynamo.planner.utils.planner_core
import
(
from
dynamo.planner.utils.planner_core
import
PlannerSharedState
,
_initialize_gpu_counts
DecodePlanner
,
from
dynamo.planner.utils.prefill_planner
import
PrefillPlanner
PlannerSharedState
,
PrefillPlanner
,
_initialize_gpu_counts
,
)
pytestmark
=
[
pytestmark
=
[
pytest
.
mark
.
gpu_0
,
pytest
.
mark
.
gpu_0
,
...
@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
...
@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
shared_state
=
PlannerSharedState
()
shared_state
=
PlannerSharedState
()
prefill_planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
prefill_planner
=
PrefillPlanner
(
None
,
args
,
shared_state
=
shared_state
)
decode_planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
decode_planner
=
DecodePlanner
(
None
,
args
,
shared_state
=
shared_state
)
prefill_planner
.
prometheus_
api
_client
=
prometheus_client
prefill_planner
.
prometheus_
traffic
_client
=
prometheus_client
decode_planner
.
prometheus_
api
_client
=
prometheus_client
decode_planner
.
prometheus_
traffic
_client
=
prometheus_client
prefill_planner
.
model_name
=
"test-model"
prefill_planner
.
model_name
=
"test-model"
decode_planner
.
model_name
=
"test-model"
decode_planner
.
model_name
=
"test-model"
...
@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):
...
@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):
def
_run_interval
(
prefill_planner
,
decode_planner
,
shared_state
):
def
_run_interval
(
prefill_planner
,
decode_planner
,
shared_state
):
asyncio
.
run
(
asyncio
.
run
(
prefill_planner
.
observe_
metric
s
(
require_prefill
=
True
,
require_decode
=
True
)
prefill_planner
.
observe_
traffic_stat
s
(
require_prefill
=
True
,
require_decode
=
True
)
)
)
decode_planner
.
update_predictors_from_metrics
(
shared_state
.
last_metrics
)
decode_planner
.
update_predictors_from_metrics
(
shared_state
.
last_metrics
)
next_num_p
=
prefill_planner
.
plan_adjustment
()
next_num_p
=
prefill_planner
.
plan_adjustment
()
...
...
tests/planner/utils/load_generator.py
View file @
359765d3
...
@@ -230,7 +230,7 @@ class LoadGenerator:
...
@@ -230,7 +230,7 @@ class LoadGenerator:
logger
.
warning
(
f
"Failed to parse aiperf results:
{
e
}
"
)
logger
.
warning
(
f
"Failed to parse aiperf results:
{
e
}
"
)
return
{}
return
{}
async
def
run_scaling_test
(
self
)
->
Dict
[
str
,
Any
]:
async
def
run_scaling_test
(
self
,
mode
:
str
=
"throughput"
)
->
Dict
[
str
,
Any
]:
"""
"""
Run a graduated scaling test for prefill scaling.
Run a graduated scaling test for prefill scaling.
...
@@ -238,17 +238,23 @@ class LoadGenerator:
...
@@ -238,17 +238,23 @@ class LoadGenerator:
- Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
Args:
mode: Scaling mode - "throughput" or "load".
"load" uses a longer baseline for regression warmup.
Returns:
Returns:
Dictionary with complete test results
Dictionary with complete test results
"""
"""
logger
.
info
(
logger
.
info
(
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D)"
f
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D
, mode=
{
mode
}
)"
)
)
logger
.
info
(
"Using conservative graduated approach with metric generation"
)
logger
.
info
(
"Using conservative graduated approach with metric generation"
)
# Graduated test parameters (optimized for prefill scaling)
# Graduated test parameters (optimized for prefill scaling)
# Load-based scaling needs longer baseline for regression warmup
baseline_duration
=
120
if
mode
==
"load"
else
90
phases
:
List
[
Dict
[
str
,
Any
]]
=
[
phases
:
List
[
Dict
[
str
,
Any
]]
=
[
{
"rate"
:
8.0
,
"duration"
:
90
,
"name"
:
"baseline"
},
{
"rate"
:
8.0
,
"duration"
:
baseline_duration
,
"name"
:
"baseline"
},
{
"rate"
:
18.0
,
"duration"
:
120
,
"name"
:
"prefill_scaling_trigger"
},
{
"rate"
:
18.0
,
"duration"
:
120
,
"name"
:
"prefill_scaling_trigger"
},
]
]
transition_delay
=
30
transition_delay
=
30
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment