feat: load-based scaling in SLA Planner (#6145)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: load-based scaling in SLA Planner (#6145)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
359765d3 · Hongkuan Zhou · GitHub · 815b1291 · 359765d3 · 359765d3
Unverified Commit 359765d3 authored Feb 14, 2026 by Hongkuan Zhou Committed by GitHub Feb 14, 2026
7 changed files
--- a/tests/planner/scaling/disagg_planner_throughput.yaml
+++ b/tests/planner/scaling/disagg_planner_throughput.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-disagg-planner
+spec:
+  services:
+    Frontend:
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+    Planner:
+      componentType: planner
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/components/src/dynamo/planner
+          command:
+          - python3
+          - -m
+          - planner_sla
+          args:
+            - --environment=kubernetes
+            - --backend=vllm
+            - --adjustment-interval=60
+            - --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
+            - --no-correction
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      componentType: worker
+      subComponentType: decode
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+          command:
+            - python3
+          args:
+            - -m
+            - dynamo.vllm
+            - --model
+            - nvidia/Llama-3.1-8B-Instruct-FP8
+    VllmPrefillWorker:
+      envFromSecret: hf-token-secret
+      componentType: worker
+      subComponentType: prefill
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+          command:
+            - python3
+          args:
+            - -m
+            - dynamo.vllm
+            - --model
+            - nvidia/Llama-3.1-8B-Instruct-FP8
+            - --is-prefill-worker
--- a/tests/planner/scaling/run_scaling_test.sh
+++ b/tests/planner/scaling/run_scaling_test.sh
@@ -7,20 +7,24 @@
 # 1. Deploys the disaggregated planner if not already running
 # 2. Sets up port forwarding to localhost:8000
 # 3. Waits for the deployment to be ready
-# 4. Runs the hardcoded scaling test (12 req/s -> 24 req/s)
+# 4. Runs the scaling test (8 req/s -> 18 req/s)
 # 5. Cleans up
+#
+# Supports two modes:
+#   --mode throughput  (default) Uses throughput-based planner
+#   --mode load        Uses load-based planner with regression scaling

 set -e

 # Configuration
 NAMESPACE=${NAMESPACE:-default}
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-YAML_FILE="$SCRIPT_DIR/disagg_planner.yaml"
 TEST_FILE="$SCRIPT_DIR/../test_scaling_e2e.py"
 FRONTEND_PORT=8000
 LOCAL_PORT=8000
 DEPLOYMENT_NAME="vllm-disagg-planner"
 SAVE_RESULTS=false
+MODE="throughput"

 # Colors for output
 RED='\033[0;31m'
@@ -198,14 +202,14 @@ cleanup_deployment() {
 }

 run_test() {
-    log_info "Running scaling test (graduated 8->18 req/s)..."
+    log_info "Running scaling test (graduated 8->18 req/s, mode=$MODE)..."

    local python_cmd="python3"
    if ! command -v python3 &> /dev/null; then
        python_cmd="python"
    fi

-    local test_args="--namespace $NAMESPACE"
+    local test_args="--namespace $NAMESPACE --mode $MODE"
    if [ "$SAVE_RESULTS" = true ]; then
        test_args="$test_args --save-results"
        log_info "Results will be saved to tests/planner/e2e_scaling_results"
@@ -227,17 +231,26 @@ main() {
                NAMESPACE="$2"
                shift 2
                ;;
+            --mode)
+                MODE="$2"
+                if [[ "$MODE" != "throughput" && "$MODE" != "load" ]]; then
+                    log_error "Invalid mode: $MODE (must be 'throughput' or 'load')"
+                    exit 1
+                fi
+                shift 2
+                ;;
            --save-results)
                SAVE_RESULTS=true
                shift
                ;;
            --help)
-                echo "Usage: $0 [--namespace NS] [--save-results]"
+                echo "Usage: $0 [--namespace NS] [--mode MODE] [--save-results]"
                echo ""
-                echo "Run SLA planner scaling test (graduated 8->15->25 req/s prefill scaling)"
+                echo "Run SLA planner scaling test (graduated 8->18 req/s prefill scaling)"
                echo ""
                echo "Options:"
                echo "  --namespace NS    Kubernetes namespace (default: default)"
+                echo "  --mode MODE       Scaling mode: 'throughput' (default) or 'load'"
                echo "  --save-results    Save results to tests/planner/e2e_scaling_results instead of /tmp"
                echo "  --help            Show this help"
                exit 0
@@ -250,8 +263,17 @@ main() {
        esac
    done

+    # Select YAML based on mode
+    if [ "$MODE" = "load" ]; then
+        YAML_FILE="$SCRIPT_DIR/disagg_planner_load.yaml"
+    else
+        YAML_FILE="$SCRIPT_DIR/disagg_planner_throughput.yaml"
+    fi
+
    log_info "SLA Planner Scaling Test"
    log_info "Namespace: $NAMESPACE"
+    log_info "Mode: $MODE"
+    log_info "YAML: $YAML_FILE"
    log_info "Scenario: Graduated 8->18 req/s (1P1D -> 2P1D prefill scaling, ISL=4000/OSL=150)"

    check_prerequisites

--- a/tests/planner/test_replica_calculation.py
+++ b/tests/planner/test_replica_calculation.py
@@ -16,13 +16,13 @@ from unittest.mock import Mock, patch

 import pytest

+from dynamo.planner.utils.decode_planner import DecodePlanner
 from dynamo.planner.utils.planner_core import (
-    DecodePlanner,
-    Metrics,
    PlannerSharedState,
-    PrefillPlanner,
    _apply_global_gpu_budget,
 )
+from dynamo.planner.utils.prefill_planner import PrefillPlanner
+from dynamo.planner.utils.prometheus import Metrics

 pytestmark = [pytest.mark.pre_merge, pytest.mark.gpu_0]

@@ -78,7 +78,7 @@ class PlannerHarness:
            "isl_predictor",
            "osl_predictor",
            "connector",
-            "prometheus_api_client",
+            "prometheus_traffic_client",
            "args",
        }
        prefill_attrs = {
@@ -111,7 +111,7 @@ class PlannerHarness:
            "isl_predictor",
            "osl_predictor",
            "connector",
-            "prometheus_api_client",
+            "prometheus_traffic_client",
            "args",
            "get_workers_info",
        }
@@ -194,7 +194,7 @@ def planner():
        planner.connector = Mock()

        # Mock prometheus client
-        planner.prometheus_api_client = Mock()
+        planner.prometheus_traffic_client = Mock()

        # Set up some baseline correction factors
        planner.p_correction_factor = 1.0

--- a/tests/planner/test_scaling_e2e.py
+++ b/tests/planner/test_scaling_e2e.py
@@ -261,10 +261,12 @@ class ScalingE2ETest:
        namespace: str = "default",
        base_url: str = "http://localhost:8000",
        save_results: bool = False,
+        mode: str = "throughput",
    ):
        self.namespace = namespace
        self.base_url = base_url
        self.save_results = save_results
+        self.mode = mode

        self.k8s_monitor = KubernetesMonitor(namespace)
        self.load_generator = LoadGenerator(
@@ -281,7 +283,7 @@ class ScalingE2ETest:
        - Phase 1 (8 req/s): Should maintain 1P1D
        - Phase 2 (18 req/s): Should scale to 2P1D
        """
-        logger.info("Starting scaling integration test")
+        logger.info(f"Starting scaling integration test (mode={self.mode})")

        test_start_time = time.time()

@@ -291,7 +293,11 @@ class ScalingE2ETest:

        # Start background monitoring
        # Calculate based on actual phases from load generator
-        # Phase durations: baseline(90s) + transition(30s) + trigger(120s) + buffer
+        if self.mode == "load":
+            # Load-based: baseline(120s) + transition(30s) + trigger(120s) + buffer
+            total_test_duration = 120 + 30 + 120 + BUFFER_DURATION
+        else:
+            # Throughput: baseline(90s) + transition(30s) + trigger(120s) + buffer
            total_test_duration = 90 + 30 + 120 + BUFFER_DURATION
        monitoring_task = asyncio.create_task(
            self.k8s_monitor.monitor_scaling(
@@ -305,8 +311,10 @@ class ScalingE2ETest:

        try:
            # Use the load generator's built-in scaling test
-            logger.info("Running scaling scenario (8 req/s -> 18 req/s)")
-            load_results = await self.load_generator.run_scaling_test()
+            logger.info(
+                f"Running scaling scenario (8 req/s -> 18 req/s, mode={self.mode})"
+            )
+            load_results = await self.load_generator.run_scaling_test(mode=self.mode)

            # Extract load results for analysis (2-phase structure)
            phase_results = load_results.get("phase_results", {})
@@ -475,12 +483,20 @@ async def main():
        action="store_true",
        help="Save results to tests/planner/e2e_scaling_results instead of /tmp",
    )
-    # No additional arguments needed - test is hardcoded
+    parser.add_argument(
+        "--mode",
+        choices=["throughput", "load"],
+        default="throughput",
+        help="Scaling mode to test: throughput (default) or load",
+    )

    args = parser.parse_args()

    test = ScalingE2ETest(
-        namespace=args.namespace, base_url=args.base_url, save_results=args.save_results
+        namespace=args.namespace,
+        base_url=args.base_url,
+        save_results=args.save_results,
+        mode=args.mode,
    )

    try:

--- a/tests/planner/unit/test_load_based_scaling.py
+++ b/tests/planner/unit/test_load_based_scaling.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import os
+from unittest.mock import Mock, patch
+
+import pytest
+
+from dynamo.planner.utils.decode_planner import DecodePlanner
+from dynamo.planner.utils.load_based_regression import LoadBasedRegressionModel
+from dynamo.planner.utils.planner_argparse import validate_sla_planner_args
+from dynamo.planner.utils.planner_core import PlannerSharedState
+from dynamo.planner.utils.prefill_planner import PrefillPlanner
+from dynamo.planner.utils.prometheus import CachedLoadMetrics, DirectRouterMetricsClient
+
+pytestmark = [
+    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
+    pytest.mark.unit,
+    pytest.mark.planner,
+]
+
+
+# ── LoadBasedRegressionModel tests ──────────────────────────────────────
+
+
+class TestLoadBasedRegressionModel:
+    def test_insufficient_data(self):
+        model = LoadBasedRegressionModel(window_size=50, min_observations=5)
+        assert not model.has_sufficient_data()
+        assert model.predict_x_from_sla(100.0) is None
+
+    def test_basic_linear_prediction(self):
+        model = LoadBasedRegressionModel(window_size=50, min_observations=3)
+        # y = 2x + 10: x in [1..5], y in [12..20]
+        for x in range(1, 6):
+            model.add_observation(float(x), 2.0 * x + 10.0)
+        assert model.has_sufficient_data()
+        # Reverse: x = (y - 10) / 2, y=100 => x=45
+        result = model.predict_x_from_sla(100.0)
+        assert result is not None
+        assert abs(result - 45.0) < 0.5
+
+    def test_negative_slope_fallback_points_below_sla(self):
+        model = LoadBasedRegressionModel(window_size=50, min_observations=3)
+        # Negative slope: higher x => lower y
+        # x=1 -> y=98, x=2 -> y=96, x=3 -> y=94, x=4 -> y=92, x=5 -> y=90
+        for x in range(1, 6):
+            model.add_observation(float(x), 100.0 - 2.0 * x)
+        # target_y=95 => points below: x=3(y=94), x=4(y=92), x=5(y=90)
+        # min x among those is 3
+        result = model.predict_x_from_sla(95.0)
+        assert result is not None
+        assert abs(result - 3.0) < 0.01
+
+    def test_negative_slope_fallback_all_above_sla(self):
+        model = LoadBasedRegressionModel(window_size=50, min_observations=3)
+        # Negative slope: x=1 -> y=98, x=2 -> y=96, ..., x=5 -> y=90
+        for x in range(1, 6):
+            model.add_observation(float(x), 100.0 - 2.0 * x)
+        # target_y=50 => all points have y >= 90 > 50, none below
+        # fallback returns smallest x overall = 1
+        result = model.predict_x_from_sla(50.0)
+        assert result is not None
+        assert abs(result - 1.0) < 0.01
+
+    def test_sliding_window_evicts_old(self):
+        model = LoadBasedRegressionModel(window_size=5, min_observations=3)
+        # Add 10 observations; only last 5 should remain
+        for i in range(10):
+            model.add_observation(float(i), float(i) * 2)
+        assert model.num_observations == 5
+
+    def test_result_clamped_to_non_negative(self):
+        model = LoadBasedRegressionModel(window_size=50, min_observations=3)
+        # y = 10x + 100: intercept=100, slope=10
+        for x in range(1, 6):
+            model.add_observation(float(x), 10.0 * x + 100.0)
+        # target_y=5 => x = (5-100)/10 = -9.5 => clamped to 0
+        result = model.predict_x_from_sla(5.0)
+        assert result == 0.0
+
+    def test_slope_and_intercept_properties(self):
+        model = LoadBasedRegressionModel(window_size=50, min_observations=3)
+        for x in range(1, 6):
+            model.add_observation(float(x), 3.0 * x + 5.0)
+        assert model.slope is not None
+        assert abs(model.slope - 3.0) < 0.01
+        assert model.intercept is not None
+        assert abs(model.intercept - 5.0) < 0.01
+
+
+# ── DirectRouterMetricsClient tests ─────────────────────────────────────
+
+
+class TestDirectRouterMetricsClient:
+    def test_parse_prometheus_text_basic(self):
+        """Metrics with dynamo_namespace/model labels are grouped by worker_type."""
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "test-ns")
+        text = (
+            "# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens\n"
+            "# TYPE dynamo_frontend_worker_active_prefill_tokens gauge\n"
+            'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 1234\n'
+            'dynamo_frontend_worker_active_decode_blocks{dynamo_namespace="test-ns",model="TestModel",worker_type="decode",worker_id="w2"} 56\n'
+            'dynamo_frontend_worker_last_time_to_first_token_seconds{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 0.25\n'
+            'dynamo_frontend_worker_last_input_sequence_tokens{dynamo_namespace="test-ns",model="TestModel",worker_type="prefill",worker_id="w1"} 3000\n'
+            'dynamo_frontend_worker_last_inter_token_latency_seconds{dynamo_namespace="test-ns",model="TestModel",worker_type="decode",worker_id="w2"} 0.04\n'
+        )
+        result = client._parse_prometheus_text(text)
+        assert "prefill" in result
+        assert "w1" in result["prefill"]
+        assert result["prefill"]["w1"]["active_prefill_tokens"] == 1234.0
+        assert abs(result["prefill"]["w1"]["last_ttft"] - 0.25) < 1e-6
+        assert result["prefill"]["w1"]["last_isl"] == 3000.0
+        assert "decode" in result
+        assert "w2" in result["decode"]
+        assert result["decode"]["w2"]["active_decode_blocks"] == 56.0
+        assert abs(result["decode"]["w2"]["last_itl"] - 0.04) < 1e-6
+
+    def test_parse_ignores_extra_labels(self):
+        """Parser extracts metrics regardless of extra labels like dynamo_namespace/model."""
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "ns")
+        text = 'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="any-ns",model="mymodel",worker_type="prefill",worker_id="w1"} 100\n'
+        result = client._parse_prometheus_text(text)
+        assert "prefill" in result
+        assert "w1" in result["prefill"]
+        assert result["prefill"]["w1"]["active_prefill_tokens"] == 100.0
+
+    def test_get_recent_and_averaged_empty_buffer(self):
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "ns")
+        assert client.get_recent_and_averaged_metrics("prefill") is None
+
+    def test_get_recent_and_averaged_single_sample(self):
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "ns")
+        client._sample_buffer = [
+            {
+                "prefill": {"w1": {"active_prefill_tokens": 100.0}},
+                "decode": {"w2": {"active_decode_blocks": 50.0}},
+            }
+        ]
+        result = client.get_recent_and_averaged_metrics("prefill")
+        assert result is not None
+        recent, per_worker_avg, cluster_avg = result
+        assert recent["w1"]["active_prefill_tokens"] == 100.0
+        assert per_worker_avg["w1"]["active_prefill_tokens"] == 100.0
+        assert cluster_avg["active_prefill_tokens"] == 100.0
+        # decode workers not included
+        assert "w2" not in recent
+
+        result_d = client.get_recent_and_averaged_metrics("decode")
+        assert result_d is not None
+        recent_d, per_worker_avg_d, cluster_avg_d = result_d
+        assert recent_d["w2"]["active_decode_blocks"] == 50.0
+        assert per_worker_avg_d["w2"]["active_decode_blocks"] == 50.0
+        assert cluster_avg_d["active_decode_blocks"] == 50.0
+
+    def test_get_recent_and_averaged_multiple_samples(self):
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "ns")
+        client._sample_buffer = [
+            {"prefill": {"w1": {"active_prefill_tokens": 100.0}}},
+            {"prefill": {"w1": {"active_prefill_tokens": 200.0}}},
+            {"prefill": {"w1": {"active_prefill_tokens": 300.0}}},
+        ]
+        result = client.get_recent_and_averaged_metrics("prefill")
+        assert result is not None
+        recent, per_worker_avg, cluster_avg = result
+        # Recent should be the last sample
+        assert abs(recent["w1"]["active_prefill_tokens"] - 300.0) < 1e-6
+        # Per-worker averaged over time
+        assert abs(per_worker_avg["w1"]["active_prefill_tokens"] - 200.0) < 1e-6
+        # Cluster averaged (same as per-worker when only 1 worker)
+        assert abs(cluster_avg["active_prefill_tokens"] - 200.0) < 1e-6
+
+    def test_parse_multiple_workers(self):
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "ns")
+        text = (
+            'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="ns",model="M",worker_type="prefill",worker_id="w1"} 100\n'
+            'dynamo_frontend_worker_active_prefill_tokens{dynamo_namespace="ns",model="M",worker_type="prefill",worker_id="w2"} 200\n'
+        )
+        result = client._parse_prometheus_text(text)
+        assert len(result.get("prefill", {})) == 2
+        assert result["prefill"]["w1"]["active_prefill_tokens"] == 100.0
+        assert result["prefill"]["w2"]["active_prefill_tokens"] == 200.0
+
+    def test_parse_rust_labels_separates_worker_types(self):
+        """Rust KV router emits all metrics for all workers; parser must separate by worker_type."""
+        client = DirectRouterMetricsClient("http://localhost:8000/metrics", "ns")
+        text = (
+            "# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens\n"
+            "# TYPE dynamo_frontend_worker_active_prefill_tokens gauge\n"
+            'dynamo_frontend_worker_active_prefill_tokens{worker_id="123",dp_rank="0",worker_type="prefill"} 500\n'
+            'dynamo_frontend_worker_active_prefill_tokens{worker_id="456",dp_rank="0",worker_type="decode"} 0\n'
+            'dynamo_frontend_worker_active_decode_blocks{worker_id="123",dp_rank="0",worker_type="prefill"} 0\n'
+            'dynamo_frontend_worker_active_decode_blocks{worker_id="456",dp_rank="0",worker_type="decode"} 30\n'
+            'dynamo_frontend_worker_last_time_to_first_token_seconds{worker_id="123",dp_rank="0",worker_type="prefill"} 0.15\n'
+            'dynamo_frontend_worker_last_input_sequence_tokens{worker_id="123",dp_rank="0",worker_type="prefill"} 2000\n'
+            'dynamo_frontend_worker_last_inter_token_latency_seconds{worker_id="456",dp_rank="0",worker_type="decode"} 0.03\n'
+        )
+        result = client._parse_prometheus_text(text)
+
+        # Prefill worker 123 grouped under "prefill"
+        assert "prefill" in result
+        assert "123" in result["prefill"]
+        assert result["prefill"]["123"]["active_prefill_tokens"] == 500.0
+        assert result["prefill"]["123"]["last_ttft"] == 0.15
+        assert result["prefill"]["123"]["last_isl"] == 2000.0
+
+        # Decode worker 456 grouped under "decode"
+        assert "decode" in result
+        assert "456" in result["decode"]
+        assert result["decode"]["456"]["active_decode_blocks"] == 30.0
+        assert abs(result["decode"]["456"]["last_itl"] - 0.03) < 1e-6
+
+        # Cross-type metrics are stored under the correct worker_type
+        # (prefill worker's decode_blocks=0 stored under "prefill", not "decode")
+        assert "456" not in result["prefill"]
+        assert "123" not in result["decode"]
+
+
+# ── PrefillPlanner load-based scaling tests ─────────────────────────────
+
+
+@pytest.fixture(autouse=True)
+def mock_prometheus_metrics():
+    with patch("dynamo.planner.utils.planner_core.Gauge") as mock_gauge:
+        mock_gauge.return_value = Mock()
+        yield
+
+
+def _build_loadbased_args():
+    args = argparse.Namespace()
+    args.adjustment_interval = 60
+    args.prefill_engine_num_gpu = 1
+    args.decode_engine_num_gpu = 1
+    args.min_endpoint = 1
+    args.max_gpu_budget = -1
+    args.ttft = 500.0
+    args.itl = 50.0
+    args.backend = "vllm"
+    args.no_operation = True
+    args.no_correction = True
+    args.metric_pulling_prometheus_endpoint = "http://localhost:9090"
+    args.metric_reporting_prometheus_port = 0
+    args.load_predictor = "constant"
+    args.load_predictor_warmup_trace = None
+    args.profile_results_dir = os.path.join(
+        os.path.dirname(__file__),
+        "..",
+        "profiling_results",
+        "H200_TP1P_TP1D",
+    )
+    args.environment = "kubernetes"
+    args.namespace = "test-namespace"
+    args.mode = "disagg"
+    # Load-based scaling config
+    args.enable_loadbased_scaling = True
+    args.enable_throughput_scaling = True
+    args.disable_throughput_scaling = False
+    args.loadbased_router_metrics_url = "http://router:8000/metrics"
+    args.loadbased_adjustment_interval = 5
+    args.loadbased_learning_window = 50
+    args.loadbased_scaling_down_sensitivity = 80
+    args.loadbased_metric_samples = 10
+    args.loadbased_min_observations = 5
+    return args
+
+
+def _avg(per_worker: dict[str, dict[str, float]]) -> dict[str, float]:
+    """Compute flat averaged metrics from per-worker dicts (for test convenience)."""
+    sums: dict[str, float] = {}
+    counts: dict[str, int] = {}
+    for metrics in per_worker.values():
+        for k, v in metrics.items():
+            sums[k] = sums.get(k, 0.0) + v
+            counts[k] = counts.get(k, 0) + 1
+    return {k: sums[k] / counts[k] for k in sums}
+
+
+class TestPrefillLoadBasedScaling:
+    def test_scale_up_all_workers_above_target(self):
+        """When all workers have active_prefill_tokens above the regression target, scale up."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 2
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # Feed regression data: TTFT = 0.1 * (active_prefill_tokens + ISL) + 100
+        # With TTFT SLA = 500ms: x_sla = (500 - 100) / 0.1 = 4000
+        # If ISL avg = 3000, target_active_tokens = 4000 - 3000 = 1000
+        for i in range(10):
+            x = 2000 + i * 200  # active_tokens + ISL
+            y = 0.1 * x + 100  # TTFT in ms
+            planner.ttft_regression.add_observation(x, y)
+
+        # Set per-worker metrics: all workers ABOVE target (1000)
+        metrics = {
+            "w1": {
+                "active_prefill_tokens": 1500.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.35,
+            },
+            "w2": {
+                "active_prefill_tokens": 1200.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.30,
+            },
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result == 3  # scale up from 2 to 3
+
+    def test_scale_down_all_workers_below_boundary(self):
+        """When all workers are below the scale-down boundary, scale down."""
+        args = _build_loadbased_args()
+        args.loadbased_scaling_down_sensitivity = 100  # max sensitivity
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 3
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # Feed regression: TTFT = 0.1 * x + 100
+        # x_sla = (500-100)/0.1 = 4000, target = 4000-3000 = 1000
+        # boundary = 1000 * (3-1)/3 * 1.0 = 666.67
+        for i in range(10):
+            x = 2000 + i * 200
+            y = 0.1 * x + 100
+            planner.ttft_regression.add_observation(x, y)
+
+        # All workers below boundary (666.67)
+        metrics = {
+            "w1": {
+                "active_prefill_tokens": 100.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.15,
+            },
+            "w2": {
+                "active_prefill_tokens": 200.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.16,
+            },
+            "w3": {
+                "active_prefill_tokens": 150.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.15,
+            },
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result == 2  # scale down from 3 to 2
+
+    def test_no_change_mixed_workers(self):
+        """When workers are mixed (some above, some below), no scaling."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 2
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        for i in range(10):
+            x = 2000 + i * 200
+            y = 0.1 * x + 100
+            planner.ttft_regression.add_observation(x, y)
+
+        # Mixed: one above target, one below
+        metrics = {
+            "w1": {
+                "active_prefill_tokens": 1500.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.35,
+            },
+            "w2": {
+                "active_prefill_tokens": 100.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.15,
+            },
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result is None
+
+    def test_cold_start_returns_none(self):
+        """With insufficient data, loadbased_plan_adjustment returns None."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 2
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # Only 2 observations (min is 5)
+        planner.ttft_regression.add_observation(1000.0, 200.0)
+        planner.ttft_regression.add_observation(2000.0, 300.0)
+
+        metrics = {
+            "w1": {
+                "active_prefill_tokens": 5000.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.5,
+            },
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result is None
+
+
+class TestDecodeLoadBasedScaling:
+    def test_scale_up_all_workers_above_target(self):
+        """When all workers have active_decode_blocks above x_sla, scale up."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_d_workers = 2
+
+        planner = DecodePlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # Feed regression: ITL = 0.5 * active_decode_blocks + 10
+        # x_sla = (50 - 10) / 0.5 = 80
+        for i in range(10):
+            x = 20 + i * 10
+            y = 0.5 * x + 10
+            planner.itl_regression.add_observation(x, y)
+
+        # All workers above x_sla (80)
+        metrics = {
+            "w1": {"active_decode_blocks": 100.0, "last_itl": 0.06},
+            "w2": {"active_decode_blocks": 95.0, "last_itl": 0.055},
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result == 3
+
+    def test_scale_down_all_workers_below_boundary(self):
+        """When all decode workers are below boundary, scale down."""
+        args = _build_loadbased_args()
+        args.loadbased_scaling_down_sensitivity = 100
+        shared_state = PlannerSharedState()
+        shared_state.num_d_workers = 3
+
+        planner = DecodePlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # ITL = 0.5 * x + 10, x_sla = (50-10)/0.5 = 80
+        # boundary = 80 * (3-1)/3 * 1.0 = 53.33
+        for i in range(10):
+            x = 20 + i * 10
+            y = 0.5 * x + 10
+            planner.itl_regression.add_observation(x, y)
+
+        # All workers below boundary (53.33)
+        metrics = {
+            "w1": {"active_decode_blocks": 10.0, "last_itl": 0.02},
+            "w2": {"active_decode_blocks": 15.0, "last_itl": 0.025},
+            "w3": {"active_decode_blocks": 20.0, "last_itl": 0.03},
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result == 2
+
+    def test_cold_start_returns_none(self):
+        """Decode cold start also returns None."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_d_workers = 2
+
+        planner = DecodePlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        planner.itl_regression.add_observation(10.0, 15.0)
+
+        metrics = {
+            "w1": {"active_decode_blocks": 200.0, "last_itl": 0.1},
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        assert result is None
+
+
+class TestLowerBoundEnforcement:
+    def test_throughput_lower_bound_respected(self):
+        """Load-based scaling should never go below throughput lower bound."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 5
+        # Throughput says we need at least 4 prefill workers
+        shared_state.throughput_lower_bound_p = 4
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # Regression says we should scale down to 4 (from 5)
+        for i in range(10):
+            x = 2000 + i * 200
+            y = 0.1 * x + 100
+            planner.ttft_regression.add_observation(x, y)
+
+        # Workers all lightly loaded => wants to scale down to 4
+        metrics = {
+            f"w{i}": {
+                "active_prefill_tokens": 50.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.12,
+            }
+            for i in range(5)
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        result = planner.loadbased_plan_adjustment()
+        # Even though load-based wants to scale down, the result should be
+        # at least 4 after lower bound enforcement (done in the loop, not in
+        # loadbased_plan_adjustment itself)
+        # loadbased_plan_adjustment returns raw desired value
+        assert result == 4  # raw value from load-based
+
+    def test_scaling_down_sensitivity_zero_never_scales_down(self):
+        """With sensitivity=0, scale-down boundary is 0 so never scale down."""
+        args = _build_loadbased_args()
+        args.loadbased_scaling_down_sensitivity = 0
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 3
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        for i in range(10):
+            x = 2000 + i * 200
+            y = 0.1 * x + 100
+            planner.ttft_regression.add_observation(x, y)
+
+        # All workers at zero load
+        metrics = {
+            f"w{i}": {
+                "active_prefill_tokens": 0.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.12,
+            }
+            for i in range(3)
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        # boundary = target * (3-1)/3 * 0/100 = 0
+        # all workers at 0 which is NOT less than 0 (it's equal)
+        result = planner.loadbased_plan_adjustment()
+        assert result is None  # no scaling happens
+
+
+# ── Correction factor auto-disable tests ─────────────────────────────
+
+
+class TestCorrectionFactorAutoDisable:
+    def test_correction_factor_disabled_when_loadbased_enabled(self):
+        """Correction factor should be auto-disabled when load-based scaling is on."""
+        args = _build_loadbased_args()
+        args.no_correction = False  # user didn't explicitly disable
+        validate_sla_planner_args(args)
+        assert args.no_correction is True
+
+    def test_correction_factor_stays_disabled_if_already_set(self):
+        """If user already set --no-correction, no extra warning needed."""
+        args = _build_loadbased_args()
+        args.no_correction = True  # user explicitly set
+        validate_sla_planner_args(args)
+        assert args.no_correction is True
+
+    def test_correction_factor_not_disabled_without_loadbased(self):
+        """Without load-based scaling, correction factor should respect user setting."""
+        args = _build_loadbased_args()
+        args.enable_loadbased_scaling = False
+        args.no_correction = False
+        validate_sla_planner_args(args)
+        assert args.no_correction is False
+
+
+# ── DGD worker count reconciliation tests ────────────────────────────
+
+
+class TestWorkerCountReconciliation:
+    async def test_prefill_observe_gets_only_prefill_workers(self):
+        """observe_engine_load_stats for prefill queries get_recent_and_averaged_metrics('prefill')."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 1
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # get_recent_and_averaged_metrics("prefill") returns (recent, per_worker_avg, cluster_avg)
+        prefill_metrics = {
+            "w1": {
+                "active_prefill_tokens": 500.0,
+                "last_ttft": 0.2,
+                "last_isl": 3000.0,
+            },
+        }
+        planner.prometheus_engine_client = Mock()
+        planner.prometheus_engine_client.get_recent_and_averaged_metrics.return_value = (
+            prefill_metrics,
+            prefill_metrics,
+            _avg(prefill_metrics),
+        )
+
+        await planner.observe_engine_load_stats()
+
+        planner.prometheus_engine_client.get_recent_and_averaged_metrics.assert_called_once_with(
+            "prefill"
+        )
+        assert len(planner.cached_load_metrics.recent) == 1
+        assert "w1" in planner.cached_load_metrics.recent
+
+    async def test_decode_observe_gets_only_decode_workers(self):
+        """observe_engine_load_stats for decode queries get_recent_and_averaged_metrics('decode')."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_d_workers = 1
+
+        planner = DecodePlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        decode_metrics = {
+            "w2": {"active_decode_blocks": 50.0, "last_itl": 0.04},
+        }
+        planner.prometheus_engine_client = Mock()
+        planner.prometheus_engine_client.get_recent_and_averaged_metrics.return_value = (
+            decode_metrics,
+            decode_metrics,
+            _avg(decode_metrics),
+        )
+
+        await planner.observe_engine_load_stats()
+
+        planner.prometheus_engine_client.get_recent_and_averaged_metrics.assert_called_once_with(
+            "decode"
+        )
+        assert len(planner.cached_load_metrics.recent) == 1
+        assert "w2" in planner.cached_load_metrics.recent
+
+    def test_worker_count_mismatch_detected(self):
+        """When DGD and Prometheus worker counts differ, the mismatch should be detectable."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        # DGD says 3 prefill workers
+        shared_state.num_p_workers = 3
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        # But router only reports 2 prefill workers
+        metrics = {
+            "w1": {
+                "active_prefill_tokens": 500.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.2,
+            },
+            "w2": {
+                "active_prefill_tokens": 600.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.25,
+            },
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        # The mismatch should be detectable by comparing counts
+        prom_count = len(planner.cached_load_metrics.recent)
+        dgd_count = shared_state.num_p_workers
+        assert prom_count != dgd_count
+        assert prom_count == 2
+        assert dgd_count == 3
+
+    def test_worker_count_match_allows_scaling(self):
+        """When DGD and Prometheus counts match, scaling proceeds normally."""
+        args = _build_loadbased_args()
+        shared_state = PlannerSharedState()
+        shared_state.num_p_workers = 2
+
+        planner = PrefillPlanner(None, args, shared_state=shared_state)
+        planner.model_name = "test-model"
+
+        metrics = {
+            "w1": {
+                "active_prefill_tokens": 1500.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.35,
+            },
+            "w2": {
+                "active_prefill_tokens": 1200.0,
+                "last_isl": 3000.0,
+                "last_ttft": 0.30,
+            },
+        }
+        planner.cached_load_metrics = CachedLoadMetrics(
+            recent=metrics, per_worker_averaged=metrics, cluster_averaged=_avg(metrics)
+        )
+
+        prom_count = len(planner.cached_load_metrics.recent)
+        dgd_count = shared_state.num_p_workers
+        assert prom_count == dgd_count
+
+        # With matching counts and sufficient regression data, scaling should work
+        for i in range(10):
+            x = 2000 + i * 200
+            y = 0.1 * x + 100
+            planner.ttft_regression.add_observation(x, y)
+
+        result = planner.loadbased_plan_adjustment()
+        assert result is not None  # scaling proceeds
--- a/tests/planner/unit/test_sla_planner_scaling.py
+++ b/tests/planner/unit/test_sla_planner_scaling.py
@@ -9,13 +9,10 @@ from unittest.mock import Mock, patch

 import pytest

+from dynamo.planner.utils.decode_planner import DecodePlanner
 from dynamo.planner.utils.exceptions import DeploymentValidationError
-from dynamo.planner.utils.planner_core import (
-    DecodePlanner,
-    PlannerSharedState,
-    PrefillPlanner,
-    _initialize_gpu_counts,
-)
+from dynamo.planner.utils.planner_core import PlannerSharedState, _initialize_gpu_counts
+from dynamo.planner.utils.prefill_planner import PrefillPlanner

 pytestmark = [
    pytest.mark.gpu_0,
@@ -82,8 +79,8 @@ def _build_planners(args, prometheus_client):
    shared_state = PlannerSharedState()
    prefill_planner = PrefillPlanner(None, args, shared_state=shared_state)
    decode_planner = DecodePlanner(None, args, shared_state=shared_state)
-    prefill_planner.prometheus_api_client = prometheus_client
-    decode_planner.prometheus_api_client = prometheus_client
+    prefill_planner.prometheus_traffic_client = prometheus_client
+    decode_planner.prometheus_traffic_client = prometheus_client
    prefill_planner.model_name = "test-model"
    decode_planner.model_name = "test-model"

@@ -131,7 +128,7 @@ def _expected_decode(args, decode_planner, sample):

 def _run_interval(prefill_planner, decode_planner, shared_state):
    asyncio.run(
-        prefill_planner.observe_metrics(require_prefill=True, require_decode=True)
+        prefill_planner.observe_traffic_stats(require_prefill=True, require_decode=True)
    )
    decode_planner.update_predictors_from_metrics(shared_state.last_metrics)
    next_num_p = prefill_planner.plan_adjustment()

--- a/tests/planner/utils/load_generator.py
+++ b/tests/planner/utils/load_generator.py
@@ -230,7 +230,7 @@ class LoadGenerator:
            logger.warning(f"Failed to parse aiperf results: {e}")
            return {}

-    async def run_scaling_test(self) -> Dict[str, Any]:
+    async def run_scaling_test(self, mode: str = "throughput") -> Dict[str, Any]:
        """
        Run a graduated scaling test for prefill scaling.

@@ -238,17 +238,23 @@ class LoadGenerator:
        - Phase 1: 8 req/s (baseline, should maintain 1P1D)
        - Phase 2: 18 req/s (should trigger prefill scaling to 2P1D)

+        Args:
+            mode: Scaling mode - "throughput" or "load".
+                  "load" uses a longer baseline for regression warmup.
+
        Returns:
            Dictionary with complete test results
        """
        logger.info(
-            "Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D)"
+            f"Starting graduated prefill scaling test scenario (targeting 1P1D -> 2P1D, mode={mode})"
        )
        logger.info("Using conservative graduated approach with metric generation")

        # Graduated test parameters (optimized for prefill scaling)
+        # Load-based scaling needs longer baseline for regression warmup
+        baseline_duration = 120 if mode == "load" else 90
        phases: List[Dict[str, Any]] = [
-            {"rate": 8.0, "duration": 90, "name": "baseline"},
+            {"rate": 8.0, "duration": baseline_duration, "name": "baseline"},
            {"rate": 18.0, "duration": 120, "name": "prefill_scaling_trigger"},
        ]
        transition_delay = 30