feat(planner): add optimization_target for easy-mode scaling (#8137)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

feat(planner): add optimization_target for easy-mode scaling (#8137)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
9cf9ef18 · Hongkuan Zhou · GitHub · 2cabf441 · 9cf9ef18 · 9cf9ef18
Unverified Commit 9cf9ef18 authored Apr 13, 2026 by Hongkuan Zhou Committed by GitHub Apr 14, 2026
16 changed files
--- a/components/src/dynamo/planner/config/planner_config.py
+++ b/components/src/dynamo/planner/config/planner_config.py
@@ -55,6 +55,15 @@ class PlannerConfig(BaseModel):
    )
    backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend
    mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode
+    optimization_target: Literal["throughput", "latency", "sla"] = Field(
+        default="throughput",
+        description=(
+            "Scaling optimization target. "
+            "'throughput' (default) and 'latency' use static thresholds on queue "
+            "depth and KV cache utilization — no SLA targets or profiling needed. "
+            "'sla' uses regression-based scaling that targets specific ttft/itl values."
+        ),
+    )
    no_operation: bool = SLAPlannerDefaults.no_operation
    log_dir: Optional[str] = SLAPlannerDefaults.log_dir
@@ -163,6 +172,20 @@ class PlannerConfig(BaseModel):
                "Please specify the namespace where GlobalPlanner is running."
            )
+        # Easy mode: force load scaling on, throughput scaling off
+        if self.optimization_target != "sla":
+            self.enable_load_scaling = True
+            self.enable_throughput_scaling = False
+            if (
+                self.ttft != SLAPlannerDefaults.ttft
+                or self.itl != SLAPlannerDefaults.itl
+            ):
+                logger.warning(
+                    "optimization_target=%s ignores ttft/itl values; "
+                    "set optimization_target='sla' to use SLA-based scaling",
+                    self.optimization_target,
+                )
        # At least one scaling mode must be enabled
        if not self.enable_throughput_scaling and not self.enable_load_scaling:
            raise ValueError(

--- a/components/src/dynamo/planner/core/base.py
+++ b/components/src/dynamo/planner/core/base.py
@@ -77,6 +77,7 @@ def _engine_caps(
        else None,
        max_num_seqs=worker_info.max_num_seqs if worker_info else None,
        context_length=worker_info.context_length if worker_info else None,
+        max_kv_tokens=worker_info.max_kv_tokens if worker_info else None,
    )

--- a/components/src/dynamo/planner/core/load_scaling.py
+++ b/components/src/dynamo/planner/core/load_scaling.py
@@ -20,6 +20,19 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
+# -- Easy-mode static thresholds (optimization_target != "sla") -----------
+# The easy-mode decision methods scale a component up by one worker when ANY
+# engine crosses its scale-up threshold, and down by one worker when ALL
+# engines are under the scale-down threshold.
+# Prefill: ratio of queued_prefill_tokens / context_length
+# (scale-up comparison is inclusive: ratio >= threshold)
+_PREFILL_THROUGHPUT_SCALE_UP = 1.0  # queued >= context_length
+_PREFILL_THROUGHPUT_SCALE_DOWN = 0.1  # queued < context_length / 10
+_PREFILL_LATENCY_SCALE_UP = 0.1  # queued >= context_length / 10
+_PREFILL_LATENCY_SCALE_DOWN = 0.0  # queued == 0
+# Decode/Agg: KV cache utilization (scheduled + queued) / max_kv_tokens
+# (agg additionally counts queued prefill tokens in the numerator; both
+# comparisons are strict: util > scale-up, util < scale-down)
+_DECODE_THROUGHPUT_SCALE_UP = 1.0  # util > 100%
+_DECODE_THROUGHPUT_SCALE_DOWN = 0.6  # util < 60%
+_DECODE_LATENCY_SCALE_UP = 0.4  # util > 40%
+_DECODE_LATENCY_SCALE_DOWN = 0.1  # util < 10%
 class LoadScalingMixin:
    """FPM-driven load-based scaling decisions."""
@@ -60,11 +73,19 @@ class LoadScalingMixin:
            self._diag_load_reason = "worker_count_mismatch"
            return None
-        desired = (
+        easy = self._config.optimization_target != "sla"
-            self._prefill_load_decision(fpm_stats, num_workers)
+        if easy:
-            if component == "prefill"
+            desired = (
-            else self._decode_load_decision(fpm_stats, num_workers)
+                self._prefill_easy_decision(fpm_stats, num_workers)
-        )
+                if component == "prefill"
+                else self._decode_easy_decision(fpm_stats, num_workers)
+            )
+        else:
+            desired = (
+                self._prefill_load_decision(fpm_stats, num_workers)
+                if component == "prefill"
+                else self._decode_load_decision(fpm_stats, num_workers)
+            )
        if desired is None:
            return None
@@ -113,13 +134,22 @@ class LoadScalingMixin:
            self._diag_load_reason = "worker_count_mismatch"
            return None
+        easy = self._config.optimization_target != "sla"
        p_desired = (
-            self._prefill_load_decision(p_stats, self._num_p_workers)
+            (
+                self._prefill_easy_decision(p_stats, self._num_p_workers)
+                if easy
+                else self._prefill_load_decision(p_stats, self._num_p_workers)
+            )
            if p_stats
            else None
        )
        d_desired = (
-            self._decode_load_decision(d_stats, self._num_d_workers)
+            (
+                self._decode_easy_decision(d_stats, self._num_d_workers)
+                if easy
+                else self._decode_load_decision(d_stats, self._num_d_workers)
+            )
            if d_stats
            else None
        )
@@ -174,6 +204,34 @@ class LoadScalingMixin:
        if not self._reconcile_fpm_worker_count(fpm_stats, num_workers, "agg"):
            self._diag_load_reason = "worker_count_mismatch"
            return None
+        easy = self._config.optimization_target != "sla"
+        if easy:
+            desired = self._agg_easy_decision(fpm_stats, num_workers)
+            # For agg easy mode, we directly get a single decision
+            # _agg_easy_decision already sets _diag_load_reason before returning None
+            if desired is None:
+                return None
+            original_desired = desired
+            desired = max(desired, self._config.min_endpoint)
+            if self._config.enable_throughput_scaling:
+                desired = max(desired, self._throughput_lower_bound_d)
+            desired = self._apply_single_budget(desired, "decode")
+            if desired < num_workers:
+                if desired > original_desired:
+                    self._diag_load_reason = "scale_down_capped_by_throughput"
+                else:
+                    self._diag_load_reason = "scale_down"
+            elif desired > num_workers:
+                self._diag_load_reason = "scale_up"
+            else:
+                self._diag_load_reason = "no_change"
+            logger.info(f"Agg easy-mode scaling: {num_workers} -> {desired}")
+            return ScalingDecision(num_decode=desired)
        if not self._agg_regression.has_sufficient_data():
            logger.info(
                f"Agg regression: insufficient data "
@@ -358,6 +416,193 @@ class LoadScalingMixin:
        return self._scale_decision(estimates, self._config.itl, num_workers, "agg ITL")
+    # ------------------------------------------------------------------
+    # Easy-mode decision methods (optimization_target != "sla")
+    # ------------------------------------------------------------------
+    def _prefill_easy_decision(
+        self, fpm_stats: dict[tuple[str, int], ForwardPassMetrics], num_workers: int
+    ) -> Optional[int]:
+        """Pick a prefill worker count from static queue-depth thresholds.
+
+        For every engine in ``fpm_stats`` the queued prefill tokens are divided
+        by the prefill ``context_length``. If any engine's ratio reaches the
+        scale-up threshold the result is ``num_workers + 1``. Otherwise, when
+        there is more than one worker and every engine is under the scale-down
+        threshold (inclusive in ``latency`` mode, strict otherwise), the result
+        is ``num_workers - 1`` floored at ``min_endpoint``.
+
+        Returns:
+            The desired worker count, or ``None`` when nothing should change or
+            the inputs are unusable. On ``None`` the method records
+            ``"insufficient_data"`` or ``"no_change"`` in ``_diag_load_reason``.
+        """
+        caps = self._capabilities.prefill
+        ctx_len = caps.context_length if caps else None
+        if not ctx_len or ctx_len <= 0:
+            logger.warning(
+                "context_length not available, skipping easy prefill scaling"
+            )
+            self._diag_load_reason = "insufficient_data"
+            return None
+        if num_workers == 0:
+            self._diag_load_reason = "insufficient_data"
+            return None
+        # Select the threshold pair that matches the configured target.
+        is_latency = self._config.optimization_target == "latency"
+        if is_latency:
+            up_thresh = _PREFILL_LATENCY_SCALE_UP
+            down_thresh = _PREFILL_LATENCY_SCALE_DOWN
+        else:
+            up_thresh = _PREFILL_THROUGHPUT_SCALE_UP
+            down_thresh = _PREFILL_THROUGHPUT_SCALE_DOWN
+        ratios: list[float] = []
+        for key, fpm in fpm_stats.items():
+            wid, dp = key
+            queued = fpm.queued_requests.sum_prefill_tokens
+            ratio = queued / ctx_len
+            ratios.append(ratio)
+            logger.info(
+                f"Easy prefill {wid}:dp{dp}: queued={queued}, "
+                f"context_length={ctx_len}, ratio={ratio:.3f}"
+            )
+        if not ratios:
+            self._diag_load_reason = "insufficient_data"
+            return None
+        # One overloaded engine is enough to add a worker.
+        if any(r >= up_thresh for r in ratios):
+            logger.info(
+                f"Easy prefill: engine(s) above scale-up threshold "
+                f"({up_thresh}), scaling up to {num_workers + 1}"
+            )
+            return num_workers + 1
+        # Removing a worker requires every engine to be idle enough.
+        shrinkable = False
+        if num_workers > 1:
+            if is_latency:
+                # Latency mode compares inclusively so an empty queue qualifies.
+                shrinkable = all(r <= down_thresh for r in ratios)
+            else:
+                shrinkable = all(r < down_thresh for r in ratios)
+        if not shrinkable:
+            self._diag_load_reason = "no_change"
+            return None
+        desired = max(num_workers - 1, self._config.min_endpoint)
+        if is_latency:
+            logger.info(f"Easy prefill: all engines at zero queue, -> {desired}")
+        else:
+            logger.info(
+                f"Easy prefill: all engines below scale-down threshold "
+                f"({down_thresh}), -> {desired}"
+            )
+        return desired
+    def _decode_easy_decision(
+        self, fpm_stats: dict[tuple[str, int], ForwardPassMetrics], num_workers: int
+    ) -> Optional[int]:
+        """Pick a decode worker count from static KV-utilization thresholds.
+
+        Per-engine utilization is (scheduled + queued decode KV tokens) divided
+        by the decode ``max_kv_tokens``. If any engine exceeds the scale-up
+        threshold the result is ``num_workers + 1``. Otherwise, when there is
+        more than one worker and every engine is strictly below the scale-down
+        threshold, the result is ``num_workers - 1`` floored at ``min_endpoint``.
+
+        Returns:
+            The desired worker count, or ``None`` when nothing should change or
+            the inputs are unusable. On ``None`` the method records
+            ``"insufficient_data"`` or ``"no_change"`` in ``_diag_load_reason``.
+        """
+        caps = self._capabilities.decode
+        max_kv = caps.max_kv_tokens if caps else None
+        if not max_kv or max_kv <= 0:
+            logger.warning("max_kv_tokens not available, skipping easy decode scaling")
+            self._diag_load_reason = "insufficient_data"
+            return None
+        if num_workers == 0:
+            self._diag_load_reason = "insufficient_data"
+            return None
+        # Select the threshold pair that matches the configured target.
+        if self._config.optimization_target == "latency":
+            up_thresh = _DECODE_LATENCY_SCALE_UP
+            down_thresh = _DECODE_LATENCY_SCALE_DOWN
+        else:
+            up_thresh = _DECODE_THROUGHPUT_SCALE_UP
+            down_thresh = _DECODE_THROUGHPUT_SCALE_DOWN
+        utils: list[float] = []
+        for key, fpm in fpm_stats.items():
+            wid, dp = key
+            sched_kv = fpm.scheduled_requests.sum_decode_kv_tokens
+            queued_kv = fpm.queued_requests.sum_decode_kv_tokens
+            util = (sched_kv + queued_kv) / max_kv
+            utils.append(util)
+            logger.info(
+                f"Easy decode {wid}:dp{dp}: sched_kv={sched_kv}, "
+                f"queued_kv={queued_kv}, max_kv={max_kv}, util={util:.3f}"
+            )
+        if not utils:
+            self._diag_load_reason = "insufficient_data"
+            return None
+        # One overloaded engine is enough to add a worker.
+        if any(u > up_thresh for u in utils):
+            logger.info(
+                f"Easy decode: engine(s) above scale-up threshold "
+                f"({up_thresh}), scaling up to {num_workers + 1}"
+            )
+            return num_workers + 1
+        # Keep the current size unless a worker can be removed safely.
+        if num_workers <= 1 or not all(u < down_thresh for u in utils):
+            self._diag_load_reason = "no_change"
+            return None
+        desired = max(num_workers - 1, self._config.min_endpoint)
+        logger.info(
+            f"Easy decode: all engines below scale-down threshold "
+            f"({down_thresh}), -> {desired}"
+        )
+        return desired
+    def _agg_easy_decision(
+        self, fpm_stats: dict[tuple[str, int], ForwardPassMetrics], num_workers: int
+    ) -> Optional[int]:
+        """Pick an aggregated worker count from static KV-utilization thresholds.
+
+        Per-engine utilization is (scheduled decode KV + queued decode KV +
+        queued prefill tokens) divided by the ``max_kv_tokens`` read from the
+        decode capabilities; the decode threshold constants are reused. If any
+        engine exceeds the scale-up threshold the result is ``num_workers + 1``.
+        Otherwise, when there is more than one worker and every engine is
+        strictly below the scale-down threshold, the result is
+        ``num_workers - 1`` floored at ``min_endpoint``.
+
+        Returns:
+            The desired worker count, or ``None`` when nothing should change or
+            the inputs are unusable. On ``None`` the method records
+            ``"insufficient_data"`` or ``"no_change"`` in ``_diag_load_reason``.
+        """
+        caps = self._capabilities.decode
+        max_kv = caps.max_kv_tokens if caps else None
+        if not max_kv or max_kv <= 0:
+            logger.warning("max_kv_tokens not available, skipping easy agg scaling")
+            self._diag_load_reason = "insufficient_data"
+            return None
+        if num_workers == 0:
+            self._diag_load_reason = "insufficient_data"
+            return None
+        # Select the threshold pair that matches the configured target.
+        if self._config.optimization_target == "latency":
+            up_thresh = _DECODE_LATENCY_SCALE_UP
+            down_thresh = _DECODE_LATENCY_SCALE_DOWN
+        else:
+            up_thresh = _DECODE_THROUGHPUT_SCALE_UP
+            down_thresh = _DECODE_THROUGHPUT_SCALE_DOWN
+        utils: list[float] = []
+        for key, fpm in fpm_stats.items():
+            wid, dp = key
+            sched_kv = fpm.scheduled_requests.sum_decode_kv_tokens
+            queued_kv = fpm.queued_requests.sum_decode_kv_tokens
+            queued_prefill = fpm.queued_requests.sum_prefill_tokens
+            util = (sched_kv + queued_kv + queued_prefill) / max_kv
+            utils.append(util)
+            logger.info(
+                f"Easy agg {wid}:dp{dp}: sched_kv={sched_kv}, queued_kv={queued_kv}, "
+                f"queued_prefill={queued_prefill}, max_kv={max_kv}, util={util:.3f}"
+            )
+        if not utils:
+            self._diag_load_reason = "insufficient_data"
+            return None
+        # One overloaded engine is enough to add a worker.
+        if any(u > up_thresh for u in utils):
+            logger.info(
+                f"Easy agg: engine(s) above scale-up threshold "
+                f"({up_thresh}), scaling up to {num_workers + 1}"
+            )
+            return num_workers + 1
+        # Keep the current size unless a worker can be removed safely.
+        if num_workers <= 1 or not all(u < down_thresh for u in utils):
+            self._diag_load_reason = "no_change"
+            return None
+        desired = max(num_workers - 1, self._config.min_endpoint)
+        logger.info(
+            f"Easy agg: all engines below scale-down threshold "
+            f"({down_thresh}), -> {desired}"
+        )
+        return desired
+    # ------------------------------------------------------------------
+    # SLA-based per-engine latency estimation
+    # ------------------------------------------------------------------
    def _scale_decision(
        self, estimates: list[float], sla: float, num_workers: int, label: str
    ) -> Optional[int]:

--- a/components/src/dynamo/planner/core/state_machine.py
+++ b/components/src/dynamo/planner/core/state_machine.py
@@ -66,31 +66,34 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
        self._is_agg = config.mode == "agg"
        self._has_prefill = config.mode in ("disagg", "prefill")
        self._has_decode = config.mode in ("disagg", "decode", "agg")
+        self._is_easy = config.optimization_target != "sla"
-        if self._is_agg:
+        # Easy mode uses static thresholds -- no regression or predictors needed
-            self._agg_regression = AggRegressionModel(
+        if not self._is_easy:
-                max_num_fpm_samples=config.max_num_fpm_samples,
+            if self._is_agg:
-                min_observations=config.load_min_observations,
+                self._agg_regression = AggRegressionModel(
-                bucket_count=config.fpm_sample_bucket_size,
-            )
-        else:
-            if self._has_prefill:
-                self._prefill_regression = PrefillRegressionModel(
                    max_num_fpm_samples=config.max_num_fpm_samples,
                    min_observations=config.load_min_observations,
                    bucket_count=config.fpm_sample_bucket_size,
                )
-            if self._has_decode:
+            else:
-                self._decode_regression = DecodeRegressionModel(
+                if self._has_prefill:
-                    max_num_fpm_samples=config.max_num_fpm_samples,
+                    self._prefill_regression = PrefillRegressionModel(
-                    min_observations=config.load_min_observations,
+                        max_num_fpm_samples=config.max_num_fpm_samples,
-                    bucket_count=config.fpm_sample_bucket_size,
+                        min_observations=config.load_min_observations,
-                )
+                        bucket_count=config.fpm_sample_bucket_size,
+                    )
-        predictor_cls = LOAD_PREDICTORS[config.load_predictor]
+                if self._has_decode:
-        self._num_req_predictor = predictor_cls(config)
+                    self._decode_regression = DecodeRegressionModel(
-        self._isl_predictor = predictor_cls(config)
+                        max_num_fpm_samples=config.max_num_fpm_samples,
-        self._osl_predictor = predictor_cls(config)
+                        min_observations=config.load_min_observations,
+                        bucket_count=config.fpm_sample_bucket_size,
+                    )
+            predictor_cls = LOAD_PREDICTORS[config.load_predictor]
+            self._num_req_predictor = predictor_cls(config)
+            self._isl_predictor = predictor_cls(config)
+            self._osl_predictor = predictor_cls(config)
        self._num_p_workers: int = 0
        self._num_d_workers: int = 0
@@ -132,6 +135,9 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
        decode_fpms: Optional[list[ForwardPassMetrics]] = None,
        agg_fpms: Optional[list[ForwardPassMetrics]] = None,
    ) -> None:
+        if self._is_easy:
+            logger.debug("Skipping benchmark FPM loading in easy mode")
+            return
        if agg_fpms and self._is_agg:
            self._agg_regression.load_benchmark_fpms(agg_fpms)
            logger.info(f"Bootstrapped agg regression with {len(agg_fpms)} FPMs")
@@ -145,6 +151,9 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
            logger.info(f"Bootstrapped decode regression with {len(decode_fpms)} FPMs")
    def warm_load_predictors(self, observations: list[TrafficObservation]) -> None:
+        if self._is_easy:
+            logger.debug("Skipping load predictor warmup in easy mode")
+            return
        for obs in observations:
            self._num_req_predictor.add_data_point(obs.num_req)
            self._isl_predictor.add_data_point(obs.isl)
@@ -163,7 +172,8 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
        if tick.run_load_scaling:
            if tick_input.fpm_observations is not None:
-                self._observe_fpm(tick_input.fpm_observations)
+                if not self._is_easy:
+                    self._observe_fpm(tick_input.fpm_observations)
                load_decision = self._advance_load(tick_input.fpm_observations)
                if load_decision is not None:
                    effects.scale_to = load_decision

--- a/components/src/dynamo/planner/core/types.py
+++ b/components/src/dynamo/planner/core/types.py
@@ -134,6 +134,7 @@ class EngineCapabilities:
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: Optional[int] = None
    context_length: Optional[int] = None
+    max_kv_tokens: Optional[int] = None
 @dataclass

--- a/components/src/dynamo/planner/tests/unit/test_easy_scaling.py
+++ b/components/src/dynamo/planner/tests/unit/test_easy_scaling.py
--- a/components/src/dynamo/planner/tests/unit/test_planner_config.py
+++ b/components/src/dynamo/planner/tests/unit/test_planner_config.py
@@ -114,6 +114,7 @@ def test_agg_mode_supports_throughput_scaling():
    config = PlannerConfig(
        namespace="test-ns",
        mode="agg",
+        optimization_target="sla",
        enable_throughput_scaling=True,
        enable_load_scaling=False,
    )

--- a/components/src/dynamo/planner/tests/unit/test_state_machine.py
+++ b/components/src/dynamo/planner/tests/unit/test_state_machine.py
@@ -85,6 +85,7 @@ def _make_fpm(
 def _make_config(**overrides) -> PlannerConfig:
    defaults = dict(
        mode="disagg",
+        optimization_target="sla",
        ttft=500.0,
        itl=50.0,
        min_endpoint=1,

--- a/components/src/dynamo/profiler/tests/data/configs/3_rapid_supported_planner_rapid_sweep.yaml
+++ b/components/src/dynamo/profiler/tests/data/configs/3_rapid_supported_planner_rapid_sweep.yaml
@@ -13,6 +13,7 @@ sla:
  itl: 50.0
 features:
  planner:
+    optimization_target: sla
    pre_deployment_sweeping_mode: rapid
    enable_throughput_scaling: true
    enable_load_scaling: false

--- a/components/src/dynamo/profiler/tests/data/configs/3b_rapid_supported_planner_rapid_sweep_mocker.yaml
+++ b/components/src/dynamo/profiler/tests/data/configs/3b_rapid_supported_planner_rapid_sweep_mocker.yaml
@@ -13,6 +13,7 @@ sla:
  itl: 50.0
 features:
  planner:
+    optimization_target: sla
    pre_deployment_sweeping_mode: rapid
    enable_throughput_scaling: true
    enable_load_scaling: false

--- a/components/src/dynamo/profiler/tests/data/configs/5b_rapid_unsupported_planner_throughput_error.yaml
+++ b/components/src/dynamo/profiler/tests/data/configs/5b_rapid_unsupported_planner_throughput_error.yaml
@@ -15,6 +15,7 @@ sla:
  itl: 50.0
 features:
  planner:
+    optimization_target: sla
    pre_deployment_sweeping_mode: rapid
    enable_throughput_scaling: true
    enable_load_scaling: false

--- a/components/src/dynamo/profiler/tests/data/configs/7_thorough_planner_rapid_sweep.yaml
+++ b/components/src/dynamo/profiler/tests/data/configs/7_thorough_planner_rapid_sweep.yaml
@@ -17,6 +17,7 @@ sla:
 searchStrategy: thorough
 features:
  planner:
+    optimization_target: sla
    pre_deployment_sweeping_mode: rapid
    enable_throughput_scaling: true
    enable_load_scaling: false

--- a/components/src/dynamo/profiler/tests/data/configs/7b_thorough_planner_thorough_sweep.yaml
+++ b/components/src/dynamo/profiler/tests/data/configs/7b_thorough_planner_thorough_sweep.yaml
@@ -15,6 +15,7 @@ sla:
 searchStrategy: thorough
 features:
  planner:
+    optimization_target: sla
    pre_deployment_sweeping_mode: thorough
    enable_throughput_scaling: true
    enable_load_scaling: false

--- a/components/src/dynamo/profiler/tests/unit/test_helpers_profile_sla.py
+++ b/components/src/dynamo/profiler/tests/unit/test_helpers_profile_sla.py
@@ -77,6 +77,7 @@ def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
 def _make_planner(**overrides) -> PlannerConfig:
    base = dict(
+        optimization_target="sla",
        enable_throughput_scaling=True,
        enable_load_scaling=False,
        pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,

--- a/docs/components/planner/README.md
+++ b/docs/components/planner/README.md
@@ -17,6 +17,18 @@ LLM inference breaks these assumptions:
 The Dynamo **Planner** is an autoscaler purpose-built for these constraints. It understands engine profiling data, tracks per-worker GPU utilization, predicts traffic patterns, and makes scaling decisions that directly target TTFT and ITL SLAs — not proxy metrics.
+## Getting Started: Optimization Targets
+The planner offers three `optimization_target` settings that control how scaling decisions are made:
+| Target | Description | Requires SLA? | Requires Profiling? |
+|--------|-------------|:-------------:|:-------------------:|
+| **`throughput`** (default) | Maximizes throughput by scaling based on queue depth and KV cache utilization. Scales up when engines are saturated, scales down when utilization drops. | No | No |
+| **`latency`** | Minimizes latency by scaling aggressively to keep queues short. Scales up at lower utilization thresholds. | No | No |
+| **`sla`** | Targets specific TTFT/ITL SLA values using regression-based performance models. Most precise, but requires configuration. | Yes (`ttft`, `itl`) | Recommended |
+**We recommend starting with the default `throughput` target** — it works out of the box with zero configuration. Switch to `latency` if your workload is latency-sensitive, or to `sla` when you need precise SLA targeting with pre-deployment profiling.
 > **New to the Planner?** Start with the [Planner Guide](planner-guide.md) for a complete workflow including profiling and deployment.
 > **Need multi-DGD coordination?** See the [Global Planner Guide](global-planner.md) for shared-policy coordination across multiple DGDs and single-endpoint multi-pool deployments.
@@ -63,40 +75,51 @@ When both modes are enabled, throughput-based scaling provides a capacity floor
 - Dynamo platform installed on Kubernetes ([Installation Guide](../../kubernetes/installation-guide.md))
 - kube-prometheus-stack installed ([Metrics Setup](../../kubernetes/observability/metrics.md))
-For throughput-based scaling, pre-deployment engine performance data is also required (via self-benchmark mode or [Profiling Guide](../profiler/profiler-guide.md)).
+### Default Mode (zero config)
-### Throughput-Based Scaling (with DGDR)
-The fastest path to a throughput-based planner deployment is through a DynamoGraphDeploymentRequest, which automatically profiles your model:
+The planner works out of the box with no configuration needed. By default, `optimization_target` is set to `throughput`, which uses static thresholds on queue depth and KV cache utilization — no SLAs or profiling required:
-```bash
+```yaml
-kubectl apply -f components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml -n $NAMESPACE
+# Minimal planner config — uses throughput optimization by default
+features:
+  planner:
+    mode: disagg
+    backend: vllm
 ```
-See [Planner Guide](planner-guide.md) for the full workflow.
+For latency-sensitive workloads:
-### Load-Based Scaling (without profiling)
-To deploy with load-based scaling only (no profiling required), add these arguments to the planner service in your DGD:
 ```yaml
-args:
+features:
-  - --enable-loadbased-scaling
+  planner:
-  - --disable-throughput-scaling
+    mode: disagg
-  - --loadbased-adjustment-interval=5
+    backend: vllm
+    optimization_target: latency
 ```
-The planner will auto-discover the frontend metrics endpoint from the DGD. See [disagg_planner.yaml](https://github.com/ai-dynamo/dynamo/blob/main/examples/backends/vllm/deploy/disagg_planner.yaml) for a complete example.
+### SLA-Based Scaling (advanced)
-### Manual DGD Deployment
+For precise SLA targeting with pre-deployment profiling, set `optimization_target: sla`:
+```yaml
+features:
+  planner:
+    optimization_target: sla
+    enable_throughput_scaling: true
+    enable_load_scaling: true
+    ttft: 500.0
+    itl: 50.0
+    pre_deployment_sweeping_mode: rapid
+```
-For manual control with throughput-based scaling, use the disaggregated planner templates:
+The fastest path to SLA-based scaling is through a DynamoGraphDeploymentRequest, which automatically profiles your model:
 ```bash
-# After profiling is complete
+kubectl apply -f components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml -n $NAMESPACE
-kubectl apply -f examples/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE
 ```
+See [Planner Guide](planner-guide.md) for the full workflow.
 ## Current Limitations
 ### Load-based scaling
@@ -128,6 +151,7 @@ Load-based scaling has the following known limitations. Throughput-based scaling
 | `--namespace` | `$DYN_NAMESPACE` or `dynamo` | Dynamo logical namespace |
 | `--backend` | `vllm` | Backend framework (`sglang`, `trtllm`, `vllm`) |
 | `--mode` | `disagg` | Planner mode (`disagg`, `prefill`, `decode`, `agg`) |
+| `--optimization-target` | `throughput` | Scaling target: `throughput` (queue/util thresholds), `latency` (aggressive low-latency), `sla` (regression-based SLA targeting) |
 | `--environment` | `kubernetes` | Deployment environment |
 | `--ttft` | `500.0` | Target Time To First Token (ms) |
 | `--itl` | `50.0` | Target Inter-Token Latency (ms) |

--- a/docs/components/planner/planner-guide.md
+++ b/docs/components/planner/planner-guide.md
@@ -10,16 +10,17 @@ For a quick overview, see the [Planner overview](README.md). For architecture in
 ## Scaling Modes
-The planner supports two scaling modes that can be used independently or together:
+The planner supports three optimization targets that determine how scaling decisions are made:
- **Throughput-based scaling** (`enable_throughput_scaling: true`): Uses pre-deployment engine performance data (from self-benchmark or profiler) and traffic prediction to plan capacity. Best for stable, predictable workloads.
+- **`throughput`** (default): Uses static thresholds on queue depth and KV cache utilization. No SLA targets or profiling needed. Works out of the box.
- **Load-based scaling** (`enable_load_scaling: true`): Uses real-time ForwardPassMetrics (FPM) from the Dynamo event plane and online regression to make scaling decisions. Best for bursty or unpredictable traffic. Does not require pre-deployment data.
+- **`latency`**: Same approach as `throughput` but with more aggressive thresholds — scales up earlier and tolerates less queuing. Ideal for latency-sensitive workloads.
+- **`sla`**: Uses regression-based performance models with specific TTFT/ITL targets. Supports both throughput-based (predictive) and load-based (reactive) scaling modes. For advanced users who need precise SLA control.
 **When to use which:**
- Enable **throughput-based scaling** whenever pre-deployment performance data is available (via self-benchmark or profiler). It provides stable, prediction-based capacity planning.
+- Start with **`throughput`** (the default) — it works immediately with no configuration.
- Enable **load-based scaling** when traffic is bursty. It reacts quickly to real-time load changes.
+- Switch to **`latency`** if your workload has strict latency requirements and you prefer to over-provision rather than queue.
- Enable **both** for the best of both worlds: throughput-based provides a capacity floor, load-based handles bursts above it. When both are enabled, use a longer `throughput_adjustment_interval`.
+- Use **`sla`** when you have pre-deployment profiling data and want to target specific TTFT/ITL values.
 ## PlannerConfig Reference
@@ -28,6 +29,17 @@ The planner is configured via a `PlannerConfig` JSON/YAML object. When using the
 ```yaml
 features:
  planner:
+    mode: disagg
+    backend: vllm
+    # optimization_target defaults to "throughput" — works out of the box
+```
+For SLA-based scaling:
+```yaml
+features:
+  planner:
+    optimization_target: sla
    enable_throughput_scaling: true
    enable_load_scaling: false
    pre_deployment_sweeping_mode: rapid
@@ -35,14 +47,22 @@ features:
    backend: vllm
 ```
-### Scaling Mode Fields
+### Optimization Target
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `optimization_target` | string | `throughput` | `throughput`: scale based on queue/utilization thresholds. `latency`: aggressive low-latency thresholds. `sla`: regression-based scaling with ttft/itl targets. |
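+When `optimization_target` is `throughput` or `latency`, load-based scaling is always enabled and throughput-based scaling is always disabled, overriding any `enable_load_scaling`/`enable_throughput_scaling` values you set. The `ttft`/`itl` fields are ignored; the planner logs a warning if they differ from their defaults.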
+### Scaling Mode Fields (SLA mode)
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
-| `enable_throughput_scaling` | bool | `true` | Enable throughput-based scaling (requires pre-deployment performance data). |
+| `enable_throughput_scaling` | bool | `true` | Enable throughput-based scaling (requires pre-deployment performance data). Only used when `optimization_target: sla`. |
-| `enable_load_scaling` | bool | `false` | Enable load-based scaling. |
+| `enable_load_scaling` | bool | `false` | Enable load-based scaling. Only used when `optimization_target: sla`. |
-At least one scaling mode must be enabled.
+At least one scaling mode must be enabled when using `optimization_target: sla`.
 ### Pre-Deployment Sweeping