Unverified commit 359765d3, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: load-based scaling in SLA Planner (#6145)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent 815b1291
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg-planner
spec:
services:
Frontend:
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
Planner:
componentType: planner
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/components/src/dynamo/planner
command:
- python3
- -m
- planner_sla
args:
- --environment=kubernetes
- --backend=vllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
- --no-correction
VllmDecodeWorker:
envFromSecret: hf-token-secret
componentType: worker
subComponentType: decode
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
args:
- -m
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
subComponentType: prefill
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
args:
- -m
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --is-prefill-worker
......@@ -7,20 +7,24 @@
# 1. Deploys the disaggregated planner if not already running
# 2. Sets up port forwarding to localhost:8000
# 3. Waits for the deployment to be ready
# 4. Runs the hardcoded scaling test (12 req/s -> 24 req/s)
# 4. Runs the scaling test (8 req/s -> 18 req/s)
# 5. Cleans up
#
# Supports two modes:
# --mode throughput (default) Uses throughput-based planner
# --mode load Uses load-based planner with regression scaling
set -e
# Configuration
NAMESPACE=${NAMESPACE:-default}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
YAML_FILE="$SCRIPT_DIR/disagg_planner.yaml"
TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py"
FRONTEND_PORT=8000
LOCAL_PORT=8000
DEPLOYMENT_NAME="vllm-disagg-planner"
SAVE_RESULTS=false
MODE="throughput"
# Colors for output
RED='\033[0;31m'
......@@ -198,14 +202,14 @@ cleanup_deployment() {
}
run_test() {
log_info "Running scaling test (graduated 8->18 req/s)..."
log_info "Running scaling test (graduated 8->18 req/s, mode=$MODE)..."
local python_cmd="python3"
if ! command -v python3 &> /dev/null; then
python_cmd="python"
fi
local test_args="--namespace $NAMESPACE"
local test_args="--namespace $NAMESPACE --mode $MODE"
if [ "$SAVE_RESULTS" = true ]; then
test_args="$test_args --save-results"
log_info "Results will be saved to tests/planner/e2e_scaling_results"
......@@ -227,17 +231,26 @@ main() {
NAMESPACE="$2"
shift 2
;;
--mode)
MODE="$2"
if [[ "$MODE" != "throughput" && "$MODE" != "load" ]]; then
log_error "Invalid mode: $MODE (must be 'throughput' or 'load')"
exit 1
fi
shift 2
;;
--save-results)
SAVE_RESULTS=true
shift
;;
--help)
echo "Usage: $0 [--namespace NS] [--save-results]"
echo "Usage: $0 [--namespace NS] [--mode MODE] [--save-results]"
echo ""
echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)"
echo "Run SLA planner scaling test (graduated 8->18 req/s prefill scaling)"
echo ""
echo "Options:"
echo " --namespace NS Kubernetes namespace (default: default)"
echo " --mode MODE Scaling mode: 'throughput' (default) or 'load'"
echo " --save-results Save results to tests/planner/e2e_scaling_results instead of /tmp"
echo " --help Show this help"
exit 0
......@@ -250,8 +263,17 @@ main() {
esac
done
# Select YAML based on mode
if [ "$MODE" = "load" ]; then
YAML_FILE="$SCRIPT_DIR/disagg_planner_load.yaml"
else
YAML_FILE="$SCRIPT_DIR/disagg_planner_throughput.yaml"
fi
log_info "SLA Planner Scaling Test"
log_info "Namespace: $NAMESPACE"
log_info "Mode: $MODE"
log_info "YAML: $YAML_FILE"
log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"
check_prerequisites
......
......@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch
import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.planner_core import (
DecodePlanner,
Metrics,
PlannerSharedState,
PrefillPlanner,
_apply_global_gpu_budget,
)
from dynamo.planner.utils.prefill_planner import PrefillPlanner
from dynamo.planner.utils.prometheus import Metrics
pytestmark = [pytest.mark.pre_merge, pytest.mark.gpu_0]
......@@ -78,7 +78,7 @@ class PlannerHarness:
"isl_predictor",
"osl_predictor",
"connector",
"prometheus_api_client",
"prometheus_traffic_client",
"args",
}
prefill_attrs = {
......@@ -111,7 +111,7 @@ class PlannerHarness:
"isl_predictor",
"osl_predictor",
"connector",
"prometheus_api_client",
"prometheus_traffic_client",
"args",
"get_workers_info",
}
......@@ -194,7 +194,7 @@ def planner():
planner.connector = Mock()
# Mock prometheus client
planner.prometheus_api_client = Mock()
planner.prometheus_traffic_client = Mock()
# Set up some baseline correction factors
planner.p_correction_factor = 1.0
......
......@@ -261,10 +261,12 @@ class ScalingE2ETest:
namespace: str = "default",
base_url: str = "http://localhost:8000",
save_results: bool = False,
mode: str = "throughput",
):
self.namespace = namespace
self.base_url = base_url
self.save_results = save_results
self.mode = mode
self.k8s_monitor = KubernetesMonitor(namespace)
self.load_generator = LoadGenerator(
......@@ -281,7 +283,7 @@ class ScalingE2ETest:
- Phase 1 (8 req/s): Should maintain 1P1D
- Phase 2 (18 req/s): Should scale to 2P1D
"""
logger.info("Starting scaling integration test")
logger.info(f"Starting scaling integration test (mode={self.mode})")
test_start_time = time.time()
......@@ -291,8 +293,12 @@ class ScalingE2ETest:
# Start background monitoring
# Calculate based on actual phases from load generator
# Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
if self.mode == "load":
# Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 120 + 30 + 120 + BUFFER_DURATION
else:
# Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
monitoring_task = asyncio.create_task(
self.k8s_monitor.monitor_scaling(
total_test_duration, interval=MONITORING_INTERVAL
......@@ -305,8 +311,10 @@ class ScalingE2ETest:
try:
# Use the load generator's built-in scaling test
logger.info("Running scaling scenario (8 req/s -> 18 req/s)")
load_results = await self.load_generator.run_scaling_test()
logger.info(
f"Running scaling scenario (8 req/s -> 18 req/s, mode={self.mode})"
)
load_results = await self.load_generator.run_scaling_test(mode=self.mode)
# Extract load results for analysis (2-phase structure)
phase_results = load_results.get("phase_results", {})
......@@ -475,12 +483,20 @@ async def main():
action="store_true",
help="Save results to tests/planner/e2e_scaling_results instead of /tmp",
)
# No additional arguments needed - test is hardcoded
parser.add_argument(
"--mode",
choices=["throughput", "load"],
default="throughput",
help="Scaling mode to test: throughput (default) or load",
)
args = parser.parse_args()
test = ScalingE2ETest(
namespace=args.namespace, base_url=args.base_url, save_results=args.save_results
namespace=args.namespace,
base_url=args.base_url,
save_results=args.save_results,
mode=args.mode,
)
try:
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import os
from unittest.mock import Mock, patch
import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.load_based_regression import LoadBasedRegressionModel
from dynamo.planner.utils.planner_argparse import validate_sla_planner_args
from dynamo.planner.utils.planner_core import PlannerSharedState
from dynamo.planner.utils.prefill_planner import PrefillPlanner
from dynamo.planner.utils.prometheus import CachedLoadMetrics, DirectRouterMetricsClient
pytestmark = [
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.unit,
pytest.mark.planner,
]
# ── LoadBasedRegressionModel tests ──────────────────────────────────────
class TestLoadBasedRegressionModel:
    """Tests for LoadBasedRegressionModel: fitting, inverse prediction, windowing."""

    @staticmethod
    def _fit_line(slope, intercept, min_observations=3):
        """Return a model fed with y = slope * x + intercept for x in 1..5."""
        fitted = LoadBasedRegressionModel(
            window_size=50, min_observations=min_observations
        )
        for step in range(1, 6):
            fitted.add_observation(float(step), slope * step + intercept)
        return fitted

    def test_insufficient_data(self):
        """An empty model reports insufficient data and predicts nothing."""
        empty = LoadBasedRegressionModel(window_size=50, min_observations=5)
        assert not empty.has_sufficient_data()
        assert empty.predict_x_from_sla(100.0) is None

    def test_basic_linear_prediction(self):
        """A clean positive-slope fit can be inverted to find x for a target y."""
        # y = 2x + 10: x in [1..5], y in [12..20]
        fitted = self._fit_line(2.0, 10.0)
        assert fitted.has_sufficient_data()

        # Reverse: x = (y - 10) / 2, y=100 => x=45
        predicted = fitted.predict_x_from_sla(100.0)
        assert predicted is not None
        assert abs(predicted - 45.0) < 0.5

    def test_negative_slope_fallback_points_below_sla(self):
        """With a negative slope, the smallest x whose y is below the target wins."""
        # Negative slope: higher x => lower y
        # x=1 -> y=98, x=2 -> y=96, x=3 -> y=94, x=4 -> y=92, x=5 -> y=90
        fitted = self._fit_line(-2.0, 100.0)

        # target_y=95 => points below: x=3(y=94), x=4(y=92), x=5(y=90)
        # min x among those is 3
        predicted = fitted.predict_x_from_sla(95.0)
        assert predicted is not None
        assert abs(predicted - 3.0) < 0.01

    def test_negative_slope_fallback_all_above_sla(self):
        """With a negative slope and no point below the target, the smallest x wins."""
        # Negative slope: x=1 -> y=98, x=2 -> y=96, ..., x=5 -> y=90
        fitted = self._fit_line(-2.0, 100.0)

        # target_y=50 => all points have y >= 90 > 50, none below
        # fallback returns smallest x overall = 1
        predicted = fitted.predict_x_from_sla(50.0)
        assert predicted is not None
        assert abs(predicted - 1.0) < 0.01

    def test_sliding_window_evicts_old(self):
        """Only the most recent ``window_size`` observations are retained."""
        windowed = LoadBasedRegressionModel(window_size=5, min_observations=3)
        # Add 10 observations; only last 5 should remain
        for idx in range(10):
            windowed.add_observation(float(idx), float(idx) * 2)
        assert windowed.num_observations == 5

    def test_result_clamped_to_non_negative(self):
        """A prediction that would be negative is clamped to zero."""
        # y = 10x + 100: intercept=100, slope=10
        fitted = self._fit_line(10.0, 100.0)

        # target_y=5 => x = (5-100)/10 = -9.5 => clamped to 0
        assert fitted.predict_x_from_sla(5.0) == 0.0

    def test_slope_and_intercept_properties(self):
        """The fitted slope and intercept are exposed on the model."""
        fitted = self._fit_line(3.0, 5.0)
        assert fitted.slope is not None
        assert abs(fitted.slope - 3.0) < 0.01
        assert fitted.intercept is not None
        assert abs(fitted.intercept - 5.0) < 0.01
# ── DirectRouterMetricsClient tests ─────────────────────────────────────
class TestDirectRouterMetricsClient:
    """Tests for DirectRouterMetricsClient text parsing and sample averaging."""

    @staticmethod
    def _client(namespace="ns"):
        """Build a client pointed at a dummy metrics URL (no request is made here)."""
        return DirectRouterMetricsClient("http://localhost:8000/metrics", namespace)

    def test_parse_prometheus_text_basic(self):
        """Metrics with dynamo_namespace/model labels are grouped by worker_type."""
        exposition = (
            "# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens\n"
            "# TYPE dynamo_frontend_worker_active_prefill_tokens gauge\n"
            'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 1234\n'
            'dynamo_frontend_worker_active_decode_blocks{dynamo_namespace="test-ns",model="TestModel",worker_type="decode",worker_id="w2"} 56\n'
            'dynamo_frontend_worker_last_time_to_first_token_seconds{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 0.25\n'
            'dynamo_frontend_worker_last_input_sequence_tokens{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 3000\n'
            'dynamo_frontend_worker_last_inter_token_latency_seconds{dynamo_namespace="test-ns",model="TestModel",worker_type="decode",worker_id="w2"} 0.04\n'
        )
        parsed = self._client("test-ns")._parse_prometheus_text(exposition)

        assert "prefill" in parsed
        assert "w1" in parsed["prefill"]
        prefill_w1 = parsed["prefill"]["w1"]
        assert prefill_w1["active_prefill_tokens"] == 1234.0
        assert abs(prefill_w1["last_ttft"] - 0.25) < 1e-6
        assert prefill_w1["last_isl"] == 3000.0

        assert "decode" in parsed
        assert "w2" in parsed["decode"]
        decode_w2 = parsed["decode"]["w2"]
        assert decode_w2["active_decode_blocks"] == 56.0
        assert abs(decode_w2["last_itl"] - 0.04) < 1e-6

    def test_parse_ignores_extra_labels(self):
        """Parser extracts metrics regardless of extra labels like dynamo_namespace/model."""
        exposition = 'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="any-ns",model="mymodel",worker_type="prefill",worker_id="w1"} 100\n'
        parsed = self._client()._parse_prometheus_text(exposition)
        assert "prefill" in parsed
        assert "w1" in parsed["prefill"]
        assert parsed["prefill"]["w1"]["active_prefill_tokens"] == 100.0

    def test_get_recent_and_averaged_empty_buffer(self):
        """With no samples buffered, the aggregate query returns None."""
        assert self._client().get_recent_and_averaged_metrics("prefill") is None

    def test_get_recent_and_averaged_single_sample(self):
        """One sample: recent, per-worker average and cluster average coincide."""
        router = self._client()
        router._sample_buffer = [
            {
                "prefill": {"w1": {"active_prefill_tokens": 100.0}},
                "decode": {"w2": {"active_decode_blocks": 50.0}},
            }
        ]

        prefill_view = router.get_recent_and_averaged_metrics("prefill")
        assert prefill_view is not None
        latest, worker_mean, cluster_mean = prefill_view
        assert latest["w1"]["active_prefill_tokens"] == 100.0
        assert worker_mean["w1"]["active_prefill_tokens"] == 100.0
        assert cluster_mean["active_prefill_tokens"] == 100.0
        # decode workers not included
        assert "w2" not in latest

        decode_view = router.get_recent_and_averaged_metrics("decode")
        assert decode_view is not None
        latest_d, worker_mean_d, cluster_mean_d = decode_view
        assert latest_d["w2"]["active_decode_blocks"] == 50.0
        assert worker_mean_d["w2"]["active_decode_blocks"] == 50.0
        assert cluster_mean_d["active_decode_blocks"] == 50.0

    def test_get_recent_and_averaged_multiple_samples(self):
        """Several samples: recent is the last one, averages span the buffer."""
        router = self._client()
        router._sample_buffer = [
            {"prefill": {"w1": {"active_prefill_tokens": 100.0}}},
            {"prefill": {"w1": {"active_prefill_tokens": 200.0}}},
            {"prefill": {"w1": {"active_prefill_tokens": 300.0}}},
        ]

        prefill_view = router.get_recent_and_averaged_metrics("prefill")
        assert prefill_view is not None
        latest, worker_mean, cluster_mean = prefill_view
        # Recent should be the last sample
        assert abs(latest["w1"]["active_prefill_tokens"] - 300.0) < 1e-6
        # Per-worker averaged over time
        assert abs(worker_mean["w1"]["active_prefill_tokens"] - 200.0) < 1e-6
        # Cluster averaged (same as per-worker when only 1 worker)
        assert abs(cluster_mean["active_prefill_tokens"] - 200.0) < 1e-6

    def test_parse_multiple_workers(self):
        """Two workers of the same type are kept as separate entries."""
        exposition = (
            'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="ns",model="M",worker_type="prefill",worker_id="w1"} 100\n'
            'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="ns",model="M",worker_type="prefill",worker_id="w2"} 200\n'
        )
        parsed = self._client()._parse_prometheus_text(exposition)
        assert len(parsed.get("prefill", {})) == 2
        assert parsed["prefill"]["w1"]["active_prefill_tokens"] == 100.0
        assert parsed["prefill"]["w2"]["active_prefill_tokens"] == 200.0

    def test_parse_rust_labels_separates_worker_types(self):
        """Rust KV router emits all metrics for all workers; parser must separate by worker_type."""
        exposition = (
            "# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens\n"
            "# TYPE dynamo_frontend_worker_active_prefill_tokens gauge\n"
            'dynamo_frontend_worker_active_prefill_tokens{worker_id="123",dp_rank="0",worker_type="prefill"} 500\n'
            'dynamo_frontend_worker_active_prefill_tokens{worker_id="456",dp_rank="0",worker_type="decode"} 0\n'
            'dynamo_frontend_worker_active_decode_blocks{worker_id="123",dp_rank="0",worker_type="prefill"} 0\n'
            'dynamo_frontend_worker_active_decode_blocks{worker_id="456",dp_rank="0",worker_type="decode"} 30\n'
            'dynamo_frontend_worker_last_time_to_first_token_seconds{worker_id="123",dp_rank="0",worker_type="prefill"} 0.15\n'
            'dynamo_frontend_worker_last_input_sequence_tokens{worker_id="123",dp_rank="0",worker_type="prefill"} 2000\n'
            'dynamo_frontend_worker_last_inter_token_latency_seconds{worker_id="456",dp_rank="0",worker_type="decode"} 0.03\n'
        )
        parsed = self._client()._parse_prometheus_text(exposition)

        # Prefill worker 123 grouped under "prefill"
        assert "prefill" in parsed
        assert "123" in parsed["prefill"]
        prefill_123 = parsed["prefill"]["123"]
        assert prefill_123["active_prefill_tokens"] == 500.0
        assert prefill_123["last_ttft"] == 0.15
        assert prefill_123["last_isl"] == 2000.0

        # Decode worker 456 grouped under "decode"
        assert "decode" in parsed
        assert "456" in parsed["decode"]
        decode_456 = parsed["decode"]["456"]
        assert decode_456["active_decode_blocks"] == 30.0
        assert abs(decode_456["last_itl"] - 0.03) < 1e-6

        # Cross-type metrics are stored under the correct worker_type
        # (prefill worker's decode_blocks=0 stored under "prefill", not "decode")
        assert "456" not in parsed["prefill"]
        assert "123" not in parsed["decode"]
# ── PrefillPlanner load-based scaling tests ─────────────────────────────
@pytest.fixture(autouse=True)
def mock_prometheus_metrics():
    """Patch ``planner_core.Gauge`` with a mock for the duration of each test.

    Every call to the patched ``Gauge`` returns the same ``Mock`` instance.
    NOTE(review): presumably this keeps planners from registering real
    Prometheus gauges during tests -- confirm against planner_core.
    """
    with patch("dynamo.planner.utils.planner_core.Gauge", return_value=Mock()):
        yield
def _build_loadbased_args():
    """Return a fresh argparse.Namespace of planner arguments with load-based scaling on.

    Tests mutate individual attributes on the returned namespace, so a new
    object is built on every call.
    """
    # Profiling results live in a sibling directory of this test module.
    profile_dir = os.path.join(
        os.path.dirname(__file__),
        "..",
        "profiling_results",
        "H200_TP1P_TP1D",
    )
    return argparse.Namespace(
        adjustment_interval=60,
        prefill_engine_num_gpu=1,
        decode_engine_num_gpu=1,
        min_endpoint=1,
        max_gpu_budget=-1,
        ttft=500.0,
        itl=50.0,
        backend="vllm",
        no_operation=True,
        no_correction=True,
        metric_pulling_prometheus_endpoint="http://localhost:9090",
        metric_reporting_prometheus_port=0,
        load_predictor="constant",
        load_predictor_warmup_trace=None,
        profile_results_dir=profile_dir,
        environment="kubernetes",
        namespace="test-namespace",
        mode="disagg",
        # Load-based scaling config
        enable_loadbased_scaling=True,
        enable_throughput_scaling=True,
        disable_throughput_scaling=False,
        loadbased_router_metrics_url="http://router:8000/metrics",
        loadbased_adjustment_interval=5,
        loadbased_learning_window=50,
        loadbased_scaling_down_sensitivity=80,
        loadbased_metric_samples=10,
        loadbased_min_observations=5,
    )
def _avg(per_worker: dict[str, dict[str, float]]) -> dict[str, float]:
    """Average each metric across workers (test convenience helper).

    A metric missing from some workers is averaged over the workers that
    report it. An empty input yields an empty dict.
    """
    # metric name -> (running sum, number of workers reporting it)
    totals: dict[str, tuple[float, int]] = {}
    for worker_metrics in per_worker.values():
        for name, value in worker_metrics.items():
            running, seen = totals.get(name, (0.0, 0))
            totals[name] = (running + value, seen + 1)
    return {name: running / seen for name, (running, seen) in totals.items()}
class TestPrefillLoadBasedScaling:
    """Tests for PrefillPlanner.loadbased_plan_adjustment with cached worker metrics."""

    @staticmethod
    def _new_planner(args, num_workers):
        """Create a PrefillPlanner whose shared state reports ``num_workers`` prefill workers."""
        state = PlannerSharedState()
        state.num_p_workers = num_workers
        prefill = PrefillPlanner(None, args, shared_state=state)
        prefill.model_name = "test-model"
        return prefill

    @staticmethod
    def _warm_up_regression(prefill):
        """Feed ten points of TTFT = 0.1 * (active_prefill_tokens + ISL) + 100."""
        for step in range(10):
            load = 2000 + step * 200  # active_tokens + ISL
            ttft_ms = 0.1 * load + 100  # TTFT in ms
            prefill.ttft_regression.add_observation(load, ttft_ms)

    @staticmethod
    def _cache_metrics(prefill, per_worker):
        """Install ``per_worker`` as both the recent and the averaged load metrics."""
        prefill.cached_load_metrics = CachedLoadMetrics(
            recent=per_worker,
            per_worker_averaged=per_worker,
            cluster_averaged=_avg(per_worker),
        )

    @staticmethod
    def _worker(active_tokens, ttft):
        """Per-worker metric dict with a fixed ISL of 3000."""
        return {
            "active_prefill_tokens": active_tokens,
            "last_isl": 3000.0,
            "last_ttft": ttft,
        }

    def test_scale_up_all_workers_above_target(self):
        """When all workers have active_prefill_tokens above the regression target, scale up."""
        prefill = self._new_planner(_build_loadbased_args(), 2)

        # With TTFT SLA = 500ms: x_sla = (500 - 100) / 0.1 = 4000
        # If ISL avg = 3000, target_active_tokens = 4000 - 3000 = 1000
        self._warm_up_regression(prefill)

        # Set per-worker metrics: all workers ABOVE target (1000)
        self._cache_metrics(
            prefill,
            {
                "w1": self._worker(1500.0, 0.35),
                "w2": self._worker(1200.0, 0.30),
            },
        )

        assert prefill.loadbased_plan_adjustment() == 3  # scale up from 2 to 3

    def test_scale_down_all_workers_below_boundary(self):
        """When all workers are below the scale-down boundary, scale down."""
        args = _build_loadbased_args()
        args.loadbased_scaling_down_sensitivity = 100  # max sensitivity
        prefill = self._new_planner(args, 3)

        # x_sla = (500-100)/0.1 = 4000, target = 4000-3000 = 1000
        # boundary = 1000 * (3-1)/3 * 1.0 = 666.67
        self._warm_up_regression(prefill)

        # All workers below boundary (666.67)
        self._cache_metrics(
            prefill,
            {
                "w1": self._worker(100.0, 0.15),
                "w2": self._worker(200.0, 0.16),
                "w3": self._worker(150.0, 0.15),
            },
        )

        assert prefill.loadbased_plan_adjustment() == 2  # scale down from 3 to 2

    def test_no_change_mixed_workers(self):
        """When workers are mixed (some above, some below), no scaling."""
        prefill = self._new_planner(_build_loadbased_args(), 2)
        self._warm_up_regression(prefill)

        # Mixed: one above target, one below
        self._cache_metrics(
            prefill,
            {
                "w1": self._worker(1500.0, 0.35),
                "w2": self._worker(100.0, 0.15),
            },
        )

        assert prefill.loadbased_plan_adjustment() is None

    def test_cold_start_returns_none(self):
        """With insufficient data, loadbased_plan_adjustment returns None."""
        prefill = self._new_planner(_build_loadbased_args(), 2)

        # Only 2 observations (min is 5)
        for load, ttft_ms in ((1000.0, 200.0), (2000.0, 300.0)):
            prefill.ttft_regression.add_observation(load, ttft_ms)

        self._cache_metrics(prefill, {"w1": self._worker(5000.0, 0.5)})

        assert prefill.loadbased_plan_adjustment() is None
class TestDecodeLoadBasedScaling:
    """Tests for DecodePlanner.loadbased_plan_adjustment with cached worker metrics."""

    @staticmethod
    def _new_planner(args, num_workers):
        """Create a DecodePlanner whose shared state reports ``num_workers`` decode workers."""
        state = PlannerSharedState()
        state.num_d_workers = num_workers
        decode = DecodePlanner(None, args, shared_state=state)
        decode.model_name = "test-model"
        return decode

    @staticmethod
    def _warm_up_regression(decode):
        """Feed ten points of ITL = 0.5 * active_decode_blocks + 10."""
        for step in range(10):
            blocks = 20 + step * 10
            itl_ms = 0.5 * blocks + 10
            decode.itl_regression.add_observation(blocks, itl_ms)

    @staticmethod
    def _cache_metrics(decode, per_worker):
        """Install ``per_worker`` as both the recent and the averaged load metrics."""
        decode.cached_load_metrics = CachedLoadMetrics(
            recent=per_worker,
            per_worker_averaged=per_worker,
            cluster_averaged=_avg(per_worker),
        )

    def test_scale_up_all_workers_above_target(self):
        """When all workers have active_decode_blocks above x_sla, scale up."""
        decode = self._new_planner(_build_loadbased_args(), 2)

        # x_sla = (50 - 10) / 0.5 = 80
        self._warm_up_regression(decode)

        # All workers above x_sla (80)
        self._cache_metrics(
            decode,
            {
                "w1": {"active_decode_blocks": 100.0, "last_itl": 0.06},
                "w2": {"active_decode_blocks": 95.0, "last_itl": 0.055},
            },
        )

        assert decode.loadbased_plan_adjustment() == 3

    def test_scale_down_all_workers_below_boundary(self):
        """When all decode workers are below boundary, scale down."""
        args = _build_loadbased_args()
        args.loadbased_scaling_down_sensitivity = 100
        decode = self._new_planner(args, 3)

        # x_sla = (50-10)/0.5 = 80
        # boundary = 80 * (3-1)/3 * 1.0 = 53.33
        self._warm_up_regression(decode)

        # All workers below boundary (53.33)
        self._cache_metrics(
            decode,
            {
                "w1": {"active_decode_blocks": 10.0, "last_itl": 0.02},
                "w2": {"active_decode_blocks": 15.0, "last_itl": 0.025},
                "w3": {"active_decode_blocks": 20.0, "last_itl": 0.03},
            },
        )

        assert decode.loadbased_plan_adjustment() == 2

    def test_cold_start_returns_none(self):
        """Decode cold start also returns None."""
        decode = self._new_planner(_build_loadbased_args(), 2)

        # A single observation is below the configured minimum of 5.
        decode.itl_regression.add_observation(10.0, 15.0)

        self._cache_metrics(
            decode,
            {"w1": {"active_decode_blocks": 200.0, "last_itl": 0.1}},
        )

        assert decode.loadbased_plan_adjustment() is None
class TestLowerBoundEnforcement:
    """Scale-down edge cases: throughput lower bound and zero sensitivity."""

    @staticmethod
    def _warm_up_regression(prefill):
        """Feed ten points of TTFT = 0.1 * x + 100."""
        for step in range(10):
            load = 2000 + step * 200
            prefill.ttft_regression.add_observation(load, 0.1 * load + 100)

    @staticmethod
    def _uniform_workers(count, active_tokens):
        """Build ``count`` identical per-worker metric dicts keyed w0..w{count-1}."""
        return {
            f"w{idx}": {
                "active_prefill_tokens": active_tokens,
                "last_isl": 3000.0,
                "last_ttft": 0.12,
            }
            for idx in range(count)
        }

    @staticmethod
    def _cache_metrics(prefill, per_worker):
        """Install ``per_worker`` as both the recent and the averaged load metrics."""
        prefill.cached_load_metrics = CachedLoadMetrics(
            recent=per_worker,
            per_worker_averaged=per_worker,
            cluster_averaged=_avg(per_worker),
        )

    def test_throughput_lower_bound_respected(self):
        """Load-based scaling should never go below throughput lower bound."""
        args = _build_loadbased_args()
        state = PlannerSharedState()
        state.num_p_workers = 5
        # Throughput says we need at least 4 prefill workers
        state.throughput_lower_bound_p = 4

        prefill = PrefillPlanner(None, args, shared_state=state)
        prefill.model_name = "test-model"

        # Regression says we should scale down to 4 (from 5)
        self._warm_up_regression(prefill)

        # Workers all lightly loaded => wants to scale down to 4
        self._cache_metrics(prefill, self._uniform_workers(5, 50.0))

        desired = prefill.loadbased_plan_adjustment()
        # Even though load-based wants to scale down, the result should be
        # at least 4 after lower bound enforcement (done in the loop, not in
        # loadbased_plan_adjustment itself)
        # loadbased_plan_adjustment returns raw desired value
        # NOTE(review): the raw value equals the lower bound here, so this
        # assertion does not distinguish "enforced" from "not enforced".
        assert desired == 4  # raw value from load-based

    def test_scaling_down_sensitivity_zero_never_scales_down(self):
        """With sensitivity=0, scale-down boundary is 0 so never scale down."""
        args = _build_loadbased_args()
        args.loadbased_scaling_down_sensitivity = 0
        state = PlannerSharedState()
        state.num_p_workers = 3

        prefill = PrefillPlanner(None, args, shared_state=state)
        prefill.model_name = "test-model"

        self._warm_up_regression(prefill)

        # All workers at zero load
        self._cache_metrics(prefill, self._uniform_workers(3, 0.0))

        # boundary = target * (3-1)/3 * 0/100 = 0
        # all workers at 0 which is NOT less than 0 (it's equal)
        assert prefill.loadbased_plan_adjustment() is None  # no scaling happens
# ── Correction factor auto-disable tests ─────────────────────────────
class TestCorrectionFactorAutoDisable:
    """validate_sla_planner_args and its handling of ``no_correction``."""

    @staticmethod
    def _validated(*, loadbased, no_correction):
        """Run validation on default test args with the two flags overridden."""
        args = _build_loadbased_args()
        args.enable_loadbased_scaling = loadbased
        args.no_correction = no_correction
        validate_sla_planner_args(args)
        return args

    def test_correction_factor_disabled_when_loadbased_enabled(self):
        """Correction factor should be auto-disabled when load-based scaling is on."""
        # user didn't explicitly disable
        checked = self._validated(loadbased=True, no_correction=False)
        assert checked.no_correction is True

    def test_correction_factor_stays_disabled_if_already_set(self):
        """If user already set --no-correction, no extra warning needed."""
        # user explicitly set
        checked = self._validated(loadbased=True, no_correction=True)
        assert checked.no_correction is True

    def test_correction_factor_not_disabled_without_loadbased(self):
        """Without load-based scaling, correction factor should respect user setting."""
        checked = self._validated(loadbased=False, no_correction=False)
        assert checked.no_correction is False
# ── DGD worker count reconciliation tests ────────────────────────────
class TestWorkerCountReconciliation:
    """Per-type metric observation and comparison of DGD vs. router worker counts.

    NOTE(review): the two ``async def`` tests carry no explicit asyncio marker;
    they presumably rely on the project's pytest async configuration -- confirm.
    """

    @staticmethod
    def _engine_client_returning(per_worker):
        """Mock engine client whose aggregate query returns ``per_worker`` three ways."""
        engine_client = Mock()
        engine_client.get_recent_and_averaged_metrics.return_value = (
            per_worker,
            per_worker,
            _avg(per_worker),
        )
        return engine_client

    @staticmethod
    def _prefill_planner(state):
        """PrefillPlanner bound to ``state`` with a fixed model name."""
        prefill = PrefillPlanner(None, _build_loadbased_args(), shared_state=state)
        prefill.model_name = "test-model"
        return prefill

    @staticmethod
    def _cache_metrics(target, per_worker):
        """Install ``per_worker`` as both the recent and the averaged load metrics."""
        target.cached_load_metrics = CachedLoadMetrics(
            recent=per_worker,
            per_worker_averaged=per_worker,
            cluster_averaged=_avg(per_worker),
        )

    async def test_prefill_observe_gets_only_prefill_workers(self):
        """observe_engine_load_stats for prefill queries get_recent_and_averaged_metrics('prefill')."""
        state = PlannerSharedState()
        state.num_p_workers = 1
        prefill = self._prefill_planner(state)

        # get_recent_and_averaged_metrics("prefill") returns (recent, per_worker_avg, cluster_avg)
        prefill_metrics = {
            "w1": {
                "active_prefill_tokens": 500.0,
                "last_ttft": 0.2,
                "last_isl": 3000.0,
            },
        }
        engine_client = self._engine_client_returning(prefill_metrics)
        prefill.prometheus_engine_client = engine_client

        await prefill.observe_engine_load_stats()

        engine_client.get_recent_and_averaged_metrics.assert_called_once_with(
            "prefill"
        )
        observed = prefill.cached_load_metrics.recent
        assert len(observed) == 1
        assert "w1" in observed

    async def test_decode_observe_gets_only_decode_workers(self):
        """observe_engine_load_stats for decode queries get_recent_and_averaged_metrics('decode')."""
        state = PlannerSharedState()
        state.num_d_workers = 1
        decode = DecodePlanner(None, _build_loadbased_args(), shared_state=state)
        decode.model_name = "test-model"

        decode_metrics = {
            "w2": {"active_decode_blocks": 50.0, "last_itl": 0.04},
        }
        engine_client = self._engine_client_returning(decode_metrics)
        decode.prometheus_engine_client = engine_client

        await decode.observe_engine_load_stats()

        engine_client.get_recent_and_averaged_metrics.assert_called_once_with(
            "decode"
        )
        observed = decode.cached_load_metrics.recent
        assert len(observed) == 1
        assert "w2" in observed

    def test_worker_count_mismatch_detected(self):
        """When DGD and Prometheus worker counts differ, the mismatch should be detectable."""
        state = PlannerSharedState()
        # DGD says 3 prefill workers
        state.num_p_workers = 3
        prefill = self._prefill_planner(state)

        # But router only reports 2 prefill workers
        self._cache_metrics(
            prefill,
            {
                "w1": {
                    "active_prefill_tokens": 500.0,
                    "last_isl": 3000.0,
                    "last_ttft": 0.2,
                },
                "w2": {
                    "active_prefill_tokens": 600.0,
                    "last_isl": 3000.0,
                    "last_ttft": 0.25,
                },
            },
        )

        # The mismatch should be detectable by comparing counts
        # NOTE(review): this only compares the two numbers set up above; it
        # does not exercise any planner reconciliation code path.
        router_count = len(prefill.cached_load_metrics.recent)
        dgd_count = state.num_p_workers
        assert router_count != dgd_count
        assert router_count == 2
        assert dgd_count == 3

    def test_worker_count_match_allows_scaling(self):
        """When DGD and Prometheus counts match, scaling proceeds normally."""
        state = PlannerSharedState()
        state.num_p_workers = 2
        prefill = self._prefill_planner(state)

        self._cache_metrics(
            prefill,
            {
                "w1": {
                    "active_prefill_tokens": 1500.0,
                    "last_isl": 3000.0,
                    "last_ttft": 0.35,
                },
                "w2": {
                    "active_prefill_tokens": 1200.0,
                    "last_isl": 3000.0,
                    "last_ttft": 0.30,
                },
            },
        )

        router_count = len(prefill.cached_load_metrics.recent)
        dgd_count = state.num_p_workers
        assert router_count == dgd_count

        # With matching counts and sufficient regression data, scaling should work
        for step in range(10):
            load = 2000 + step * 200
            prefill.ttft_regression.add_observation(load, 0.1 * load + 100)

        assert prefill.loadbased_plan_adjustment() is not None  # scaling proceeds
......@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch
import pytest
from dynamo.planner.utils.decode_planner import DecodePlanner
from dynamo.planner.utils.exceptions import DeploymentValidationError
from dynamo.planner.utils.planner_core import (
DecodePlanner,
PlannerSharedState,
PrefillPlanner,
_initialize_gpu_counts,
)
from dynamo.planner.utils.planner_core import PlannerSharedState, _initialize_gpu_counts
from dynamo.planner.utils.prefill_planner import PrefillPlanner
pytestmark = [
pytest.mark.gpu_0,
......@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
shared_state = PlannerSharedState()
prefill_planner = PrefillPlanner(None, args, shared_state=shared_state)
decode_planner = DecodePlanner(None, args, shared_state=shared_state)
prefill_planner.prometheus_api_client = prometheus_client
decode_planner.prometheus_api_client = prometheus_client
prefill_planner.prometheus_traffic_client = prometheus_client
decode_planner.prometheus_traffic_client = prometheus_client
prefill_planner.model_name = "test-model"
decode_planner.model_name = "test-model"
......@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):
def _run_interval(prefill_planner, decode_planner, shared_state):
asyncio.run(
prefill_planner.observe_metrics(require_prefill=True, require_decode=True)
prefill_planner.observe_traffic_stats(require_prefill=True, require_decode=True)
)
decode_planner.update_predictors_from_metrics(shared_state.last_metrics)
next_num_p = prefill_planner.plan_adjustment()
......
......@@ -230,7 +230,7 @@ class LoadGenerator:
logger.warning(f"Failed to parse aiperf results: {e}")
return {}
async def run_scaling_test(self) -> Dict[str, Any]:
async def run_scaling_test(self, mode: str = "throughput") -> Dict[str, Any]:
"""
Run a graduated scaling test for prefill scaling.
......@@ -238,17 +238,23 @@ class LoadGenerator:
- Phase 1: 8 req/s (baseline, should maintain 1P1D)
- Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)
Args:
mode: Scaling mode - "throughput" or "load".
"load" uses a longer baseline for regression warmup.
Returns:
Dictionary with complete test results
"""
logger.info(
"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D)"
f"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D, mode={mode})"
)
logger.info("Using conservative graduated approach with metric generation")
# Graduated test parameters (optimized for prefill scaling)
# Load-based scaling needs longer baseline for regression warmup
baseline_duration = 120 if mode == "load" else 90
phases: List[Dict[str, Any]] = [
{"rate": 8.0, "duration": 90, "name": "baseline"},
{"rate": 8.0, "duration": baseline_duration, "name": "baseline"},
{"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"},
]
transition_delay = 30
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment