fix: Enforce min_endpoint flag in Planner (#6637)

f1e8ea6e · Karen Chung · GitHub · 0853129a · f1e8ea6e · f1e8ea6e
Unverified commit f1e8ea6e, authored Mar 05, 2026 by Karen Chung; committed by GitHub on Mar 05, 2026.
5 changed files
--- a/components/src/dynamo/planner/utils/agg_planner.py
+++ b/components/src/dynamo/planner/utils/agg_planner.py
@@ -210,7 +210,7 @@ class AggPlanner:
            return "up"

        # Scale down: ALL workers below boundary
-        if num_workers > 1:
+        if num_workers > self.config.min_endpoint:
            sensitivity = self.config.load_scaling_down_sensitivity / 100.0
            boundary = target * (num_workers - 1) / num_workers * sensitivity
            if all(
@@ -253,7 +253,7 @@ class AggPlanner:
        # Scale down: ALL workers below boundary
        # TODO: should we strictly enforce all workers below boundary?
        # how about user-configurable percentage?
-        if num_workers > 1:
+        if num_workers > self.config.min_endpoint:
            sensitivity = self.config.load_scaling_down_sensitivity / 100.0
            boundary = x_sla * (num_workers - 1) / num_workers * sensitivity
            if all(

--- a/components/src/dynamo/planner/utils/decode_planner.py
+++ b/components/src/dynamo/planner/utils/decode_planner.py
@@ -69,6 +69,13 @@ class DecodePlanner(BasePlanner):
                m.get("active_decode_blocks", 0.0) < boundary for m in recent.values()
            )
            if all_below:
+                if num_workers - 1 < self.config.min_endpoint:
+                    logger.info(
+                        f"Load-based decode: ALL workers below boundary ({boundary:.1f}), "
+                        f"but cannot scale down below min_endpoint ({self.config.min_endpoint}); "
+                        f"maintaining {num_workers} decode workers"
+                    )
+                    return num_workers
                logger.info(
                    f"Load-based decode: ALL workers below boundary ({boundary:.1f}), "
                    f"scaling down to {num_workers - 1}"

--- a/components/src/dynamo/planner/utils/disagg_planner.py
+++ b/components/src/dynamo/planner/utils/disagg_planner.py
@@ -226,6 +226,10 @@ class DisaggPlanner:
                final_p = max(final_p, self.shared_state.throughput_lower_bound_p)
                final_d = max(final_d, self.shared_state.throughput_lower_bound_d)

+            # Enforce minimum endpoints
+            final_p = max(final_p, self.config.min_endpoint)
+            final_d = max(final_d, self.config.min_endpoint)
+
            # Apply GPU budget
            final_p, final_d = _apply_global_gpu_budget(final_p, final_d, self.config)


--- a/components/src/dynamo/planner/utils/planner_core.py
+++ b/components/src/dynamo/planner/utils/planner_core.py
@@ -746,7 +746,9 @@ class BasePlanner:

    def apply_component_budget(self, desired_replicas: int) -> int:
        return _apply_component_gpu_budget(
-            desired_replicas, self._engine_num_gpu(), self.config
+            max(desired_replicas, self.config.min_endpoint),
+            self._engine_num_gpu(),
+            self.config,
        )

    async def _apply_scaling(self, desired_replicas: int) -> None:

--- a/components/src/dynamo/planner/utils/prefill_planner.py
+++ b/components/src/dynamo/planner/utils/prefill_planner.py
@@ -78,6 +78,13 @@ class PrefillPlanner(BasePlanner):
                m.get("active_prefill_tokens", 0.0) < boundary for m in recent.values()
            )
            if all_below:
+                if num_workers - 1 < self.config.min_endpoint:
+                    logger.info(
+                        f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), "
+                        f"but cannot scale down below min_endpoint ({self.config.min_endpoint}); "
+                        f"maintaining {num_workers} prefill workers"
+                    )
+                    return num_workers
                logger.info(
                    f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), "
                    f"scaling down to {num_workers - 1}"