Unverified Commit 9cf9ef18 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat(planner): add optimization_target for easy-mode scaling (#8137)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 2cabf441
...@@ -55,6 +55,15 @@ class PlannerConfig(BaseModel): ...@@ -55,6 +55,15 @@ class PlannerConfig(BaseModel):
) )
backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend backend: Literal["vllm", "sglang", "trtllm", "mocker"] = SLAPlannerDefaults.backend
mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode mode: Literal["disagg", "prefill", "decode", "agg"] = SLAPlannerDefaults.mode
optimization_target: Literal["throughput", "latency", "sla"] = Field(
default="throughput",
description=(
"Scaling optimization target. "
"'throughput' (default) and 'latency' use static thresholds on queue "
"depth and KV cache utilization — no SLA targets or profiling needed. "
"'sla' uses regression-based scaling that targets specific ttft/itl values."
),
)
no_operation: bool = SLAPlannerDefaults.no_operation no_operation: bool = SLAPlannerDefaults.no_operation
log_dir: Optional[str] = SLAPlannerDefaults.log_dir log_dir: Optional[str] = SLAPlannerDefaults.log_dir
...@@ -163,6 +172,20 @@ class PlannerConfig(BaseModel): ...@@ -163,6 +172,20 @@ class PlannerConfig(BaseModel):
"Please specify the namespace where GlobalPlanner is running." "Please specify the namespace where GlobalPlanner is running."
) )
# Easy mode: force load scaling on, throughput scaling off
if self.optimization_target != "sla":
self.enable_load_scaling = True
self.enable_throughput_scaling = False
if (
self.ttft != SLAPlannerDefaults.ttft
or self.itl != SLAPlannerDefaults.itl
):
logger.warning(
"optimization_target=%s ignores ttft/itl values; "
"set optimization_target='sla' to use SLA-based scaling",
self.optimization_target,
)
# At least one scaling mode must be enabled # At least one scaling mode must be enabled
if not self.enable_throughput_scaling and not self.enable_load_scaling: if not self.enable_throughput_scaling and not self.enable_load_scaling:
raise ValueError( raise ValueError(
......
...@@ -77,6 +77,7 @@ def _engine_caps( ...@@ -77,6 +77,7 @@ def _engine_caps(
else None, else None,
max_num_seqs=worker_info.max_num_seqs if worker_info else None, max_num_seqs=worker_info.max_num_seqs if worker_info else None,
context_length=worker_info.context_length if worker_info else None, context_length=worker_info.context_length if worker_info else None,
max_kv_tokens=worker_info.max_kv_tokens if worker_info else None,
) )
......
...@@ -20,6 +20,19 @@ if TYPE_CHECKING: ...@@ -20,6 +20,19 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# -- Easy-mode static thresholds (optimization_target != "sla") -----------
# The *_LATENCY_* values are used when optimization_target == "latency";
# the *_THROUGHPUT_* values are used for every other easy-mode target.
# Scale-up fires when ANY engine crosses its SCALE_UP threshold; scale-down
# requires more than one worker and ALL engines under SCALE_DOWN.
#
# Prefill: ratio of queued_prefill_tokens / context_length
_PREFILL_THROUGHPUT_SCALE_UP = 1.0 # scale up when queued >= context_length
_PREFILL_THROUGHPUT_SCALE_DOWN = 0.1 # scale down when queued < context_length / 10
_PREFILL_LATENCY_SCALE_UP = 0.1 # scale up when queued >= context_length / 10
_PREFILL_LATENCY_SCALE_DOWN = 0.0 # scale down only when queued == 0 (compared with <=)
# Decode/Agg: KV cache utilization (scheduled + queued) / max_kv_tokens
# (the agg decision also adds queued prefill tokens to the numerator)
_DECODE_THROUGHPUT_SCALE_UP = 1.0 # scale up when util > 100%
_DECODE_THROUGHPUT_SCALE_DOWN = 0.6 # scale down when util < 60%
_DECODE_LATENCY_SCALE_UP = 0.4 # scale up when util > 40%
_DECODE_LATENCY_SCALE_DOWN = 0.1 # scale down when util < 10%
class LoadScalingMixin: class LoadScalingMixin:
"""FPM-driven load-based scaling decisions.""" """FPM-driven load-based scaling decisions."""
...@@ -60,11 +73,19 @@ class LoadScalingMixin: ...@@ -60,11 +73,19 @@ class LoadScalingMixin:
self._diag_load_reason = "worker_count_mismatch" self._diag_load_reason = "worker_count_mismatch"
return None return None
desired = ( easy = self._config.optimization_target != "sla"
self._prefill_load_decision(fpm_stats, num_workers) if easy:
if component == "prefill" desired = (
else self._decode_load_decision(fpm_stats, num_workers) self._prefill_easy_decision(fpm_stats, num_workers)
) if component == "prefill"
else self._decode_easy_decision(fpm_stats, num_workers)
)
else:
desired = (
self._prefill_load_decision(fpm_stats, num_workers)
if component == "prefill"
else self._decode_load_decision(fpm_stats, num_workers)
)
if desired is None: if desired is None:
return None return None
...@@ -113,13 +134,22 @@ class LoadScalingMixin: ...@@ -113,13 +134,22 @@ class LoadScalingMixin:
self._diag_load_reason = "worker_count_mismatch" self._diag_load_reason = "worker_count_mismatch"
return None return None
easy = self._config.optimization_target != "sla"
p_desired = ( p_desired = (
self._prefill_load_decision(p_stats, self._num_p_workers) (
self._prefill_easy_decision(p_stats, self._num_p_workers)
if easy
else self._prefill_load_decision(p_stats, self._num_p_workers)
)
if p_stats if p_stats
else None else None
) )
d_desired = ( d_desired = (
self._decode_load_decision(d_stats, self._num_d_workers) (
self._decode_easy_decision(d_stats, self._num_d_workers)
if easy
else self._decode_load_decision(d_stats, self._num_d_workers)
)
if d_stats if d_stats
else None else None
) )
...@@ -174,6 +204,34 @@ class LoadScalingMixin: ...@@ -174,6 +204,34 @@ class LoadScalingMixin:
if not self._reconcile_fpm_worker_count(fpm_stats, num_workers, "agg"): if not self._reconcile_fpm_worker_count(fpm_stats, num_workers, "agg"):
self._diag_load_reason = "worker_count_mismatch" self._diag_load_reason = "worker_count_mismatch"
return None return None
easy = self._config.optimization_target != "sla"
if easy:
desired = self._agg_easy_decision(fpm_stats, num_workers)
# For agg easy mode, we directly get a single decision
# _agg_easy_decision already sets _diag_load_reason before returning None
if desired is None:
return None
original_desired = desired
desired = max(desired, self._config.min_endpoint)
if self._config.enable_throughput_scaling:
desired = max(desired, self._throughput_lower_bound_d)
desired = self._apply_single_budget(desired, "decode")
if desired < num_workers:
if desired > original_desired:
self._diag_load_reason = "scale_down_capped_by_throughput"
else:
self._diag_load_reason = "scale_down"
elif desired > num_workers:
self._diag_load_reason = "scale_up"
else:
self._diag_load_reason = "no_change"
logger.info(f"Agg easy-mode scaling: {num_workers} -> {desired}")
return ScalingDecision(num_decode=desired)
if not self._agg_regression.has_sufficient_data(): if not self._agg_regression.has_sufficient_data():
logger.info( logger.info(
f"Agg regression: insufficient data " f"Agg regression: insufficient data "
...@@ -358,6 +416,193 @@ class LoadScalingMixin: ...@@ -358,6 +416,193 @@ class LoadScalingMixin:
return self._scale_decision(estimates, self._config.itl, num_workers, "agg ITL") return self._scale_decision(estimates, self._config.itl, num_workers, "agg ITL")
# ------------------------------------------------------------------
# Easy-mode decision methods (optimization_target != "sla")
# ------------------------------------------------------------------
def _prefill_easy_decision(
    self, fpm_stats: dict[tuple[str, int], ForwardPassMetrics], num_workers: int
) -> Optional[int]:
    """Pick a prefill worker count from static queue-depth thresholds.

    Each engine's load is its queued prefill tokens divided by the prefill
    ``context_length``. The ``_PREFILL_LATENCY_*`` thresholds apply when
    ``optimization_target`` is ``"latency"``; the ``_PREFILL_THROUGHPUT_*``
    thresholds apply otherwise.

    Args:
        fpm_stats: Forward-pass metrics keyed by a ``(wid, dp)`` pair.
        num_workers: Current number of prefill workers.

    Returns:
        ``num_workers + 1`` when any engine is at or above the scale-up
        threshold; ``max(num_workers - 1, min_endpoint)`` when there is more
        than one worker and every engine satisfies the scale-down test;
        otherwise ``None``, with ``_diag_load_reason`` set to
        ``"insufficient_data"`` or ``"no_change"``.
    """
    prefill_caps = self._capabilities.prefill
    ctx_len = None
    if prefill_caps:
        ctx_len = prefill_caps.context_length

    # Without a positive context length the ratio below is undefined.
    if not ctx_len or ctx_len <= 0:
        logger.warning(
            "context_length not available, skipping easy prefill scaling"
        )
        self._diag_load_reason = "insufficient_data"
        return None

    if num_workers == 0:
        self._diag_load_reason = "insufficient_data"
        return None

    latency_mode = self._config.optimization_target == "latency"
    if latency_mode:
        up_thresh = _PREFILL_LATENCY_SCALE_UP
        down_thresh = _PREFILL_LATENCY_SCALE_DOWN
    else:
        up_thresh = _PREFILL_THROUGHPUT_SCALE_UP
        down_thresh = _PREFILL_THROUGHPUT_SCALE_DOWN

    queue_ratios: list[float] = []
    for (wid, dp), fpm in fpm_stats.items():
        queued = fpm.queued_requests.sum_prefill_tokens
        ratio = queued / ctx_len
        queue_ratios.append(ratio)
        logger.info(
            f"Easy prefill {wid}:dp{dp}: queued={queued}, "
            f"context_length={ctx_len}, ratio={ratio:.3f}"
        )

    if not queue_ratios:
        self._diag_load_reason = "insufficient_data"
        return None

    # A single saturated engine is enough to add a worker.
    saturated = [r for r in queue_ratios if r >= up_thresh]
    if saturated:
        logger.info(
            f"Easy prefill: engine(s) above scale-up threshold "
            f"({up_thresh}), scaling up to {num_workers + 1}"
        )
        return num_workers + 1

    if num_workers > 1:
        # Latency mode compares inclusively so that a ratio equal to the
        # threshold counts as idle; throughput mode requires strictly below.
        if latency_mode:
            all_idle = all(r <= down_thresh for r in queue_ratios)
        else:
            all_idle = all(r < down_thresh for r in queue_ratios)

        if all_idle:
            desired = max(num_workers - 1, self._config.min_endpoint)
            if latency_mode:
                logger.info(
                    f"Easy prefill: all engines at zero queue, -> {desired}"
                )
            else:
                logger.info(
                    f"Easy prefill: all engines below scale-down threshold "
                    f"({down_thresh}), -> {desired}"
                )
            return desired

    self._diag_load_reason = "no_change"
    return None
def _decode_easy_decision(
    self, fpm_stats: dict[tuple[str, int], ForwardPassMetrics], num_workers: int
) -> Optional[int]:
    """Pick a decode worker count from static KV-utilization thresholds.

    Each engine's utilization is its scheduled plus queued decode KV tokens
    divided by the decode ``max_kv_tokens``. The ``_DECODE_LATENCY_*``
    thresholds apply when ``optimization_target`` is ``"latency"``; the
    ``_DECODE_THROUGHPUT_*`` thresholds apply otherwise.

    Args:
        fpm_stats: Forward-pass metrics keyed by a ``(wid, dp)`` pair.
        num_workers: Current number of decode workers.

    Returns:
        ``num_workers + 1`` when any engine exceeds the scale-up threshold;
        ``max(num_workers - 1, min_endpoint)`` when there is more than one
        worker and every engine is strictly below the scale-down threshold;
        otherwise ``None``, with ``_diag_load_reason`` set to
        ``"insufficient_data"`` or ``"no_change"``.
    """
    decode_caps = self._capabilities.decode
    max_kv = None
    if decode_caps:
        max_kv = decode_caps.max_kv_tokens

    # Utilization cannot be computed without a positive KV capacity.
    if not max_kv or max_kv <= 0:
        logger.warning("max_kv_tokens not available, skipping easy decode scaling")
        self._diag_load_reason = "insufficient_data"
        return None

    if num_workers == 0:
        self._diag_load_reason = "insufficient_data"
        return None

    if self._config.optimization_target == "latency":
        up_thresh = _DECODE_LATENCY_SCALE_UP
        down_thresh = _DECODE_LATENCY_SCALE_DOWN
    else:
        up_thresh = _DECODE_THROUGHPUT_SCALE_UP
        down_thresh = _DECODE_THROUGHPUT_SCALE_DOWN

    kv_utils: list[float] = []
    for (wid, dp), fpm in fpm_stats.items():
        sched_kv = fpm.scheduled_requests.sum_decode_kv_tokens
        queued_kv = fpm.queued_requests.sum_decode_kv_tokens
        util = (sched_kv + queued_kv) / max_kv
        kv_utils.append(util)
        logger.info(
            f"Easy decode {wid}:dp{dp}: sched_kv={sched_kv}, "
            f"queued_kv={queued_kv}, max_kv={max_kv}, util={util:.3f}"
        )

    if not kv_utils:
        self._diag_load_reason = "insufficient_data"
        return None

    # One engine over the limit is enough to add a worker.
    over_limit = [u for u in kv_utils if u > up_thresh]
    if over_limit:
        logger.info(
            f"Easy decode: engine(s) above scale-up threshold "
            f"({up_thresh}), scaling up to {num_workers + 1}"
        )
        return num_workers + 1

    # Removing a worker requires every engine to be under the lower bound.
    if num_workers > 1:
        if all(u < down_thresh for u in kv_utils):
            desired = max(num_workers - 1, self._config.min_endpoint)
            logger.info(
                f"Easy decode: all engines below scale-down threshold "
                f"({down_thresh}), -> {desired}"
            )
            return desired

    self._diag_load_reason = "no_change"
    return None
def _agg_easy_decision(
    self, fpm_stats: dict[tuple[str, int], ForwardPassMetrics], num_workers: int
) -> Optional[int]:
    """Pick an aggregated worker count from static KV-utilization thresholds.

    Each engine's utilization is its scheduled decode KV tokens, queued
    decode KV tokens and queued prefill tokens, summed and divided by the
    decode ``max_kv_tokens``. The decode thresholds are reused: the
    ``_DECODE_LATENCY_*`` pair when ``optimization_target`` is
    ``"latency"``, the ``_DECODE_THROUGHPUT_*`` pair otherwise.

    Args:
        fpm_stats: Forward-pass metrics keyed by a ``(wid, dp)`` pair.
        num_workers: Current number of aggregated workers.

    Returns:
        ``num_workers + 1`` when any engine exceeds the scale-up threshold;
        ``max(num_workers - 1, min_endpoint)`` when there is more than one
        worker and every engine is strictly below the scale-down threshold;
        otherwise ``None``, with ``_diag_load_reason`` set to
        ``"insufficient_data"`` or ``"no_change"``.
    """
    decode_caps = self._capabilities.decode
    max_kv = None
    if decode_caps:
        max_kv = decode_caps.max_kv_tokens

    # Utilization cannot be computed without a positive KV capacity.
    if not max_kv or max_kv <= 0:
        logger.warning("max_kv_tokens not available, skipping easy agg scaling")
        self._diag_load_reason = "insufficient_data"
        return None

    if num_workers == 0:
        self._diag_load_reason = "insufficient_data"
        return None

    if self._config.optimization_target == "latency":
        up_thresh = _DECODE_LATENCY_SCALE_UP
        down_thresh = _DECODE_LATENCY_SCALE_DOWN
    else:
        up_thresh = _DECODE_THROUGHPUT_SCALE_UP
        down_thresh = _DECODE_THROUGHPUT_SCALE_DOWN

    combined_utils: list[float] = []
    for (wid, dp), fpm in fpm_stats.items():
        sched_kv = fpm.scheduled_requests.sum_decode_kv_tokens
        queued_kv = fpm.queued_requests.sum_decode_kv_tokens
        queued_prefill = fpm.queued_requests.sum_prefill_tokens
        util = (sched_kv + queued_kv + queued_prefill) / max_kv
        combined_utils.append(util)
        logger.info(
            f"Easy agg {wid}:dp{dp}: sched_kv={sched_kv}, queued_kv={queued_kv}, "
            f"queued_prefill={queued_prefill}, max_kv={max_kv}, util={util:.3f}"
        )

    if not combined_utils:
        self._diag_load_reason = "insufficient_data"
        return None

    # One engine over the limit is enough to add a worker.
    over_limit = [u for u in combined_utils if u > up_thresh]
    if over_limit:
        logger.info(
            f"Easy agg: engine(s) above scale-up threshold "
            f"({up_thresh}), scaling up to {num_workers + 1}"
        )
        return num_workers + 1

    # Removing a worker requires every engine to be under the lower bound.
    if num_workers > 1:
        if all(u < down_thresh for u in combined_utils):
            desired = max(num_workers - 1, self._config.min_endpoint)
            logger.info(
                f"Easy agg: all engines below scale-down threshold "
                f"({down_thresh}), -> {desired}"
            )
            return desired

    self._diag_load_reason = "no_change"
    return None
# ------------------------------------------------------------------
# SLA-based per-engine latency estimation
# ------------------------------------------------------------------
def _scale_decision( def _scale_decision(
self, estimates: list[float], sla: float, num_workers: int, label: str self, estimates: list[float], sla: float, num_workers: int, label: str
) -> Optional[int]: ) -> Optional[int]:
......
...@@ -66,31 +66,34 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin): ...@@ -66,31 +66,34 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
self._is_agg = config.mode == "agg" self._is_agg = config.mode == "agg"
self._has_prefill = config.mode in ("disagg", "prefill") self._has_prefill = config.mode in ("disagg", "prefill")
self._has_decode = config.mode in ("disagg", "decode", "agg") self._has_decode = config.mode in ("disagg", "decode", "agg")
self._is_easy = config.optimization_target != "sla"
if self._is_agg: # Easy mode uses static thresholds -- no regression or predictors needed
self._agg_regression = AggRegressionModel( if not self._is_easy:
max_num_fpm_samples=config.max_num_fpm_samples, if self._is_agg:
min_observations=config.load_min_observations, self._agg_regression = AggRegressionModel(
bucket_count=config.fpm_sample_bucket_size,
)
else:
if self._has_prefill:
self._prefill_regression = PrefillRegressionModel(
max_num_fpm_samples=config.max_num_fpm_samples, max_num_fpm_samples=config.max_num_fpm_samples,
min_observations=config.load_min_observations, min_observations=config.load_min_observations,
bucket_count=config.fpm_sample_bucket_size, bucket_count=config.fpm_sample_bucket_size,
) )
if self._has_decode: else:
self._decode_regression = DecodeRegressionModel( if self._has_prefill:
max_num_fpm_samples=config.max_num_fpm_samples, self._prefill_regression = PrefillRegressionModel(
min_observations=config.load_min_observations, max_num_fpm_samples=config.max_num_fpm_samples,
bucket_count=config.fpm_sample_bucket_size, min_observations=config.load_min_observations,
) bucket_count=config.fpm_sample_bucket_size,
)
predictor_cls = LOAD_PREDICTORS[config.load_predictor] if self._has_decode:
self._num_req_predictor = predictor_cls(config) self._decode_regression = DecodeRegressionModel(
self._isl_predictor = predictor_cls(config) max_num_fpm_samples=config.max_num_fpm_samples,
self._osl_predictor = predictor_cls(config) min_observations=config.load_min_observations,
bucket_count=config.fpm_sample_bucket_size,
)
predictor_cls = LOAD_PREDICTORS[config.load_predictor]
self._num_req_predictor = predictor_cls(config)
self._isl_predictor = predictor_cls(config)
self._osl_predictor = predictor_cls(config)
self._num_p_workers: int = 0 self._num_p_workers: int = 0
self._num_d_workers: int = 0 self._num_d_workers: int = 0
...@@ -132,6 +135,9 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin): ...@@ -132,6 +135,9 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
decode_fpms: Optional[list[ForwardPassMetrics]] = None, decode_fpms: Optional[list[ForwardPassMetrics]] = None,
agg_fpms: Optional[list[ForwardPassMetrics]] = None, agg_fpms: Optional[list[ForwardPassMetrics]] = None,
) -> None: ) -> None:
if self._is_easy:
logger.debug("Skipping benchmark FPM loading in easy mode")
return
if agg_fpms and self._is_agg: if agg_fpms and self._is_agg:
self._agg_regression.load_benchmark_fpms(agg_fpms) self._agg_regression.load_benchmark_fpms(agg_fpms)
logger.info(f"Bootstrapped agg regression with {len(agg_fpms)} FPMs") logger.info(f"Bootstrapped agg regression with {len(agg_fpms)} FPMs")
...@@ -145,6 +151,9 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin): ...@@ -145,6 +151,9 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
logger.info(f"Bootstrapped decode regression with {len(decode_fpms)} FPMs") logger.info(f"Bootstrapped decode regression with {len(decode_fpms)} FPMs")
def warm_load_predictors(self, observations: list[TrafficObservation]) -> None: def warm_load_predictors(self, observations: list[TrafficObservation]) -> None:
if self._is_easy:
logger.debug("Skipping load predictor warmup in easy mode")
return
for obs in observations: for obs in observations:
self._num_req_predictor.add_data_point(obs.num_req) self._num_req_predictor.add_data_point(obs.num_req)
self._isl_predictor.add_data_point(obs.isl) self._isl_predictor.add_data_point(obs.isl)
...@@ -163,7 +172,8 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin): ...@@ -163,7 +172,8 @@ class PlannerStateMachine(LoadScalingMixin, ThroughputScalingMixin):
if tick.run_load_scaling: if tick.run_load_scaling:
if tick_input.fpm_observations is not None: if tick_input.fpm_observations is not None:
self._observe_fpm(tick_input.fpm_observations) if not self._is_easy:
self._observe_fpm(tick_input.fpm_observations)
load_decision = self._advance_load(tick_input.fpm_observations) load_decision = self._advance_load(tick_input.fpm_observations)
if load_decision is not None: if load_decision is not None:
effects.scale_to = load_decision effects.scale_to = load_decision
......
...@@ -134,6 +134,7 @@ class EngineCapabilities: ...@@ -134,6 +134,7 @@ class EngineCapabilities:
max_num_batched_tokens: Optional[int] = None max_num_batched_tokens: Optional[int] = None
max_num_seqs: Optional[int] = None max_num_seqs: Optional[int] = None
context_length: Optional[int] = None context_length: Optional[int] = None
max_kv_tokens: Optional[int] = None
@dataclass @dataclass
......
This diff is collapsed.
...@@ -114,6 +114,7 @@ def test_agg_mode_supports_throughput_scaling(): ...@@ -114,6 +114,7 @@ def test_agg_mode_supports_throughput_scaling():
config = PlannerConfig( config = PlannerConfig(
namespace="test-ns", namespace="test-ns",
mode="agg", mode="agg",
optimization_target="sla",
enable_throughput_scaling=True, enable_throughput_scaling=True,
enable_load_scaling=False, enable_load_scaling=False,
) )
......
...@@ -85,6 +85,7 @@ def _make_fpm( ...@@ -85,6 +85,7 @@ def _make_fpm(
def _make_config(**overrides) -> PlannerConfig: def _make_config(**overrides) -> PlannerConfig:
defaults = dict( defaults = dict(
mode="disagg", mode="disagg",
optimization_target="sla",
ttft=500.0, ttft=500.0,
itl=50.0, itl=50.0,
min_endpoint=1, min_endpoint=1,
......
...@@ -13,6 +13,7 @@ sla: ...@@ -13,6 +13,7 @@ sla:
itl: 50.0 itl: 50.0
features: features:
planner: planner:
optimization_target: sla
pre_deployment_sweeping_mode: rapid pre_deployment_sweeping_mode: rapid
enable_throughput_scaling: true enable_throughput_scaling: true
enable_load_scaling: false enable_load_scaling: false
......
...@@ -13,6 +13,7 @@ sla: ...@@ -13,6 +13,7 @@ sla:
itl: 50.0 itl: 50.0
features: features:
planner: planner:
optimization_target: sla
pre_deployment_sweeping_mode: rapid pre_deployment_sweeping_mode: rapid
enable_throughput_scaling: true enable_throughput_scaling: true
enable_load_scaling: false enable_load_scaling: false
......
...@@ -15,6 +15,7 @@ sla: ...@@ -15,6 +15,7 @@ sla:
itl: 50.0 itl: 50.0
features: features:
planner: planner:
optimization_target: sla
pre_deployment_sweeping_mode: rapid pre_deployment_sweeping_mode: rapid
enable_throughput_scaling: true enable_throughput_scaling: true
enable_load_scaling: false enable_load_scaling: false
......
...@@ -17,6 +17,7 @@ sla: ...@@ -17,6 +17,7 @@ sla:
searchStrategy: thorough searchStrategy: thorough
features: features:
planner: planner:
optimization_target: sla
pre_deployment_sweeping_mode: rapid pre_deployment_sweeping_mode: rapid
enable_throughput_scaling: true enable_throughput_scaling: true
enable_load_scaling: false enable_load_scaling: false
......
...@@ -15,6 +15,7 @@ sla: ...@@ -15,6 +15,7 @@ sla:
searchStrategy: thorough searchStrategy: thorough
features: features:
planner: planner:
optimization_target: sla
pre_deployment_sweeping_mode: thorough pre_deployment_sweeping_mode: thorough
enable_throughput_scaling: true enable_throughput_scaling: true
enable_load_scaling: false enable_load_scaling: false
......
...@@ -77,6 +77,7 @@ def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec: ...@@ -77,6 +77,7 @@ def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
def _make_planner(**overrides) -> PlannerConfig: def _make_planner(**overrides) -> PlannerConfig:
base = dict( base = dict(
optimization_target="sla",
enable_throughput_scaling=True, enable_throughput_scaling=True,
enable_load_scaling=False, enable_load_scaling=False,
pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid, pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
......
...@@ -17,6 +17,18 @@ LLM inference breaks these assumptions: ...@@ -17,6 +17,18 @@ LLM inference breaks these assumptions:
The Dynamo **Planner** is an autoscaler purpose-built for these constraints. It understands engine profiling data, tracks per-worker GPU utilization, predicts traffic patterns, and makes scaling decisions that directly target TTFT and ITL SLAs — not proxy metrics. The Dynamo **Planner** is an autoscaler purpose-built for these constraints. It understands engine profiling data, tracks per-worker GPU utilization, predicts traffic patterns, and makes scaling decisions that directly target TTFT and ITL SLAs — not proxy metrics.
## Getting Started: Optimization Targets
The planner offers three `optimization_target` settings that control how scaling decisions are made:
| Target | Description | Requires SLA? | Requires Profiling? |
|--------|-------------|:-------------:|:-------------------:|
| **`throughput`** (default) | Maximizes throughput by scaling based on queue depth and KV cache utilization. Scales up when engines are saturated, scales down when utilization drops. | No | No |
| **`latency`** | Minimizes latency by scaling aggressively to keep queues short. Scales up at lower utilization thresholds. | No | No |
| **`sla`** | Targets specific TTFT/ITL SLA values using regression-based performance models. Most precise, but requires configuration. | Yes (`ttft`, `itl`) | Recommended |
**We recommend starting with the default `throughput` target** — it works out of the box with zero configuration. Switch to `latency` if your workload is latency-sensitive, or to `sla` when you need precise SLA targeting with pre-deployment profiling.
> **New to the Planner?** Start with the [Planner Guide](planner-guide.md) for a complete workflow including profiling and deployment. > **New to the Planner?** Start with the [Planner Guide](planner-guide.md) for a complete workflow including profiling and deployment.
> **Need multi-DGD coordination?** See the [Global Planner Guide](global-planner.md) for shared-policy coordination across multiple DGDs and single-endpoint multi-pool deployments. > **Need multi-DGD coordination?** See the [Global Planner Guide](global-planner.md) for shared-policy coordination across multiple DGDs and single-endpoint multi-pool deployments.
...@@ -63,40 +75,51 @@ When both modes are enabled, throughput-based scaling provides a capacity floor ...@@ -63,40 +75,51 @@ When both modes are enabled, throughput-based scaling provides a capacity floor
- Dynamo platform installed on Kubernetes ([Installation Guide](../../kubernetes/installation-guide.md)) - Dynamo platform installed on Kubernetes ([Installation Guide](../../kubernetes/installation-guide.md))
- kube-prometheus-stack installed ([Metrics Setup](../../kubernetes/observability/metrics.md)) - kube-prometheus-stack installed ([Metrics Setup](../../kubernetes/observability/metrics.md))
For throughput-based scaling, pre-deployment engine performance data is also required (via self-benchmark mode or [Profiling Guide](../profiler/profiler-guide.md)). ### Default Mode (zero config)
### Throughput-Based Scaling (with DGDR)
The fastest path to a throughput-based planner deployment is through a DynamoGraphDeploymentRequest, which automatically profiles your model: The planner works out of the box with no configuration needed. By default, `optimization_target` is set to `throughput`, which uses static thresholds on queue depth and KV cache utilization — no SLAs or profiling required:
```bash ```yaml
kubectl apply -f components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml -n $NAMESPACE # Minimal planner config — uses throughput optimization by default
features:
planner:
mode: disagg
backend: vllm
``` ```
See [Planner Guide](planner-guide.md) for the full workflow. For latency-sensitive workloads:
### Load-Based Scaling (without profiling)
To deploy with load-based scaling only (no profiling required), add these arguments to the planner service in your DGD:
```yaml ```yaml
args: features:
- --enable-loadbased-scaling planner:
- --disable-throughput-scaling mode: disagg
- --loadbased-adjustment-interval=5 backend: vllm
optimization_target: latency
``` ```
The planner will auto-discover the frontend metrics endpoint from the DGD. See [disagg_planner.yaml](https://github.com/ai-dynamo/dynamo/blob/main/examples/backends/vllm/deploy/disagg_planner.yaml) for a complete example. ### SLA-Based Scaling (advanced)
### Manual DGD Deployment For precise SLA targeting with pre-deployment profiling, set `optimization_target: sla`:
```yaml
features:
planner:
optimization_target: sla
enable_throughput_scaling: true
enable_load_scaling: true
ttft: 500.0
itl: 50.0
pre_deployment_sweeping_mode: rapid
```
For manual control with throughput-based scaling, use the disaggregated planner templates: The fastest path to SLA-based scaling is through a DynamoGraphDeploymentRequest, which automatically profiles your model:
```bash ```bash
# After profiling is complete kubectl apply -f components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml -n $NAMESPACE
kubectl apply -f examples/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE
``` ```
See [Planner Guide](planner-guide.md) for the full workflow.
## Current Limitations ## Current Limitations
### Load-based scaling ### Load-based scaling
...@@ -128,6 +151,7 @@ Load-based scaling has the following known limitations. Throughput-based scaling ...@@ -128,6 +151,7 @@ Load-based scaling has the following known limitations. Throughput-based scaling
| `--namespace` | `$DYN_NAMESPACE` or `dynamo` | Dynamo logical namespace | | `--namespace` | `$DYN_NAMESPACE` or `dynamo` | Dynamo logical namespace |
| `--backend` | `vllm` | Backend framework (`sglang`, `trtllm`, `vllm`) | | `--backend` | `vllm` | Backend framework (`sglang`, `trtllm`, `vllm`) |
| `--mode` | `disagg` | Planner mode (`disagg`, `prefill`, `decode`, `agg`) | | `--mode` | `disagg` | Planner mode (`disagg`, `prefill`, `decode`, `agg`) |
| `--optimization-target` | `throughput` | Scaling target: `throughput` (queue/util thresholds), `latency` (aggressive low-latency), `sla` (regression-based SLA targeting) |
| `--environment` | `kubernetes` | Deployment environment | | `--environment` | `kubernetes` | Deployment environment |
| `--ttft` | `500.0` | Target Time To First Token (ms) | | `--ttft` | `500.0` | Target Time To First Token (ms) |
| `--itl` | `50.0` | Target Inter-Token Latency (ms) | | `--itl` | `50.0` | Target Inter-Token Latency (ms) |
......
...@@ -10,16 +10,17 @@ For a quick overview, see the [Planner overview](README.md). For architecture in ...@@ -10,16 +10,17 @@ For a quick overview, see the [Planner overview](README.md). For architecture in
## Scaling Modes ## Scaling Modes
The planner supports two scaling modes that can be used independently or together: The planner supports three optimization targets that determine how scaling decisions are made:
- **Throughput-based scaling** (`enable_throughput_scaling: true`): Uses pre-deployment engine performance data (from self-benchmark or profiler) and traffic prediction to plan capacity. Best for stable, predictable workloads. - **`throughput`** (default): Uses static thresholds on queue depth and KV cache utilization. No SLA targets or profiling needed. Works out of the box.
- **Load-based scaling** (`enable_load_scaling: true`): Uses real-time ForwardPassMetrics (FPM) from the Dynamo event plane and online regression to make scaling decisions. Best for bursty or unpredictable traffic. Does not require pre-deployment data. - **`latency`**: Same approach as `throughput` but with more aggressive thresholds — scales up earlier and tolerates less queuing. Ideal for latency-sensitive workloads.
- **`sla`**: Uses regression-based performance models with specific TTFT/ITL targets. Supports both throughput-based (predictive) and load-based (reactive) scaling modes. For advanced users who need precise SLA control.
**When to use which:** **When to use which:**
- Enable **throughput-based scaling** whenever pre-deployment performance data is available (via self-benchmark or profiler). It provides stable, prediction-based capacity planning. - Start with **`throughput`** (the default) — it works immediately with no configuration.
- Enable **load-based scaling** when traffic is bursty. It reacts quickly to real-time load changes. - Switch to **`latency`** if your workload has strict latency requirements and you prefer to over-provision rather than queue.
- Enable **both** for the best of both worlds: throughput-based provides a capacity floor, load-based handles bursts above it. When both are enabled, use a longer `throughput_adjustment_interval`. - Use **`sla`** when you have pre-deployment profiling data and want to target specific TTFT/ITL values.
## PlannerConfig Reference ## PlannerConfig Reference
...@@ -28,6 +29,17 @@ The planner is configured via a `PlannerConfig` JSON/YAML object. When using the ...@@ -28,6 +29,17 @@ The planner is configured via a `PlannerConfig` JSON/YAML object. When using the
```yaml ```yaml
features: features:
planner: planner:
mode: disagg
backend: vllm
# optimization_target defaults to "throughput" — works out of the box
```
For SLA-based scaling:
```yaml
features:
planner:
optimization_target: sla
enable_throughput_scaling: true enable_throughput_scaling: true
enable_load_scaling: false enable_load_scaling: false
pre_deployment_sweeping_mode: rapid pre_deployment_sweeping_mode: rapid
...@@ -35,14 +47,22 @@ features: ...@@ -35,14 +47,22 @@ features:
backend: vllm backend: vllm
``` ```
### Scaling Mode Fields ### Optimization Target
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `optimization_target` | string | `throughput` | `throughput`: scale based on queue/utilization thresholds. `latency`: aggressive low-latency thresholds. `sla`: regression-based scaling with ttft/itl targets. |
When `optimization_target` is `throughput` or `latency`, load-based scaling is automatically enabled and throughput-based scaling is disabled. The `ttft`/`itl` fields are ignored.
### Scaling Mode Fields (SLA mode)
| Field | Type | Default | Description | | Field | Type | Default | Description |
|-------|------|---------|-------------| |-------|------|---------|-------------|
| `enable_throughput_scaling` | bool | `true` | Enable throughput-based scaling (requires pre-deployment performance data). | | `enable_throughput_scaling` | bool | `true` | Enable throughput-based scaling (requires pre-deployment performance data). Only used when `optimization_target: sla`. |
| `enable_load_scaling` | bool | `false` | Enable load-based scaling. | | `enable_load_scaling` | bool | `false` | Enable load-based scaling. Only used when `optimization_target: sla`. |
At least one scaling mode must be enabled. At least one scaling mode must be enabled when using `optimization_target: sla`.
### Pre-Deployment Sweeping ### Pre-Deployment Sweeping
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment