Unverified commit f1e8ea6e, authored by Karen Chung and committed by GitHub
Browse files

fix: Enforce min_endpoint flag in Planner (#6637)

parent 0853129a
...@@ -210,7 +210,7 @@ class AggPlanner: ...@@ -210,7 +210,7 @@ class AggPlanner:
return "up" return "up"
# Scale down: ALL workers below boundary # Scale down: ALL workers below boundary
if num_workers > 1: if num_workers > self.config.min_endpoint:
sensitivity = self.config.load_scaling_down_sensitivity / 100.0 sensitivity = self.config.load_scaling_down_sensitivity / 100.0
boundary = target * (num_workers - 1) / num_workers * sensitivity boundary = target * (num_workers - 1) / num_workers * sensitivity
if all( if all(
...@@ -253,7 +253,7 @@ class AggPlanner: ...@@ -253,7 +253,7 @@ class AggPlanner:
# Scale down: ALL workers below boundary # Scale down: ALL workers below boundary
# TODO: should we strictly enforce all workers below boundary? # TODO: should we strictly enforce all workers below boundary?
# how about user-configurable percentage? # how about user-configurable percentage?
if num_workers > 1: if num_workers > self.config.min_endpoint:
sensitivity = self.config.load_scaling_down_sensitivity / 100.0 sensitivity = self.config.load_scaling_down_sensitivity / 100.0
boundary = x_sla * (num_workers - 1) / num_workers * sensitivity boundary = x_sla * (num_workers - 1) / num_workers * sensitivity
if all( if all(
......
...@@ -69,6 +69,13 @@ class DecodePlanner(BasePlanner): ...@@ -69,6 +69,13 @@ class DecodePlanner(BasePlanner):
m.get("active_decode_blocks", 0.0) < boundary for m in recent.values() m.get("active_decode_blocks", 0.0) < boundary for m in recent.values()
) )
if all_below: if all_below:
if num_workers - 1 < self.config.min_endpoint:
logger.info(
f"Load-based decode: ALL workers below boundary ({boundary:.1f}), "
f"but cannot scale down below min_endpoint ({self.config.min_endpoint}); "
f"maintaining {num_workers} decode workers"
)
return num_workers
logger.info( logger.info(
f"Load-based decode: ALL workers below boundary ({boundary:.1f}), " f"Load-based decode: ALL workers below boundary ({boundary:.1f}), "
f"scaling down to {num_workers - 1}" f"scaling down to {num_workers - 1}"
......
...@@ -226,6 +226,10 @@ class DisaggPlanner: ...@@ -226,6 +226,10 @@ class DisaggPlanner:
final_p = max(final_p, self.shared_state.throughput_lower_bound_p) final_p = max(final_p, self.shared_state.throughput_lower_bound_p)
final_d = max(final_d, self.shared_state.throughput_lower_bound_d) final_d = max(final_d, self.shared_state.throughput_lower_bound_d)
# Enforce minimum endpoints
final_p = max(final_p, self.config.min_endpoint)
final_d = max(final_d, self.config.min_endpoint)
# Apply GPU budget # Apply GPU budget
final_p, final_d = _apply_global_gpu_budget(final_p, final_d, self.config) final_p, final_d = _apply_global_gpu_budget(final_p, final_d, self.config)
......
...@@ -746,7 +746,9 @@ class BasePlanner: ...@@ -746,7 +746,9 @@ class BasePlanner:
def apply_component_budget(self, desired_replicas: int) -> int: def apply_component_budget(self, desired_replicas: int) -> int:
return _apply_component_gpu_budget( return _apply_component_gpu_budget(
desired_replicas, self._engine_num_gpu(), self.config max(desired_replicas, self.config.min_endpoint),
self._engine_num_gpu(),
self.config,
) )
async def _apply_scaling(self, desired_replicas: int) -> None: async def _apply_scaling(self, desired_replicas: int) -> None:
......
...@@ -78,6 +78,13 @@ class PrefillPlanner(BasePlanner): ...@@ -78,6 +78,13 @@ class PrefillPlanner(BasePlanner):
m.get("active_prefill_tokens", 0.0) < boundary for m in recent.values() m.get("active_prefill_tokens", 0.0) < boundary for m in recent.values()
) )
if all_below: if all_below:
if num_workers - 1 < self.config.min_endpoint:
logger.info(
f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), "
f"but cannot scale down below min_endpoint ({self.config.min_endpoint}); "
f"maintaining {num_workers} prefill workers"
)
return num_workers
logger.info( logger.info(
f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), " f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), "
f"scaling down to {num_workers - 1}" f"scaling down to {num_workers - 1}"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment