Commit f1e8ea6e (unverified), authored by Karen Chung, committed by GitHub
Browse files

fix: Enforce min_endpoint flag in Planner (#6637)

parent 0853129a
......@@ -210,7 +210,7 @@ class AggPlanner:
return "up"
# Scale down: ALL workers below boundary
if num_workers > 1:
if num_workers > self.config.min_endpoint:
sensitivity = self.config.load_scaling_down_sensitivity / 100.0
boundary = target * (num_workers - 1) / num_workers * sensitivity
if all(
......@@ -253,7 +253,7 @@ class AggPlanner:
# Scale down: ALL workers below boundary
# TODO: should we strictly enforce all workers below boundary?
# how about user-configurable percentage?
if num_workers > 1:
if num_workers > self.config.min_endpoint:
sensitivity = self.config.load_scaling_down_sensitivity / 100.0
boundary = x_sla * (num_workers - 1) / num_workers * sensitivity
if all(
......
......@@ -69,6 +69,13 @@ class DecodePlanner(BasePlanner):
m.get("active_decode_blocks", 0.0) < boundary for m in recent.values()
)
if all_below:
if num_workers - 1 < self.config.min_endpoint:
logger.info(
f"Load-based decode: ALL workers below boundary ({boundary:.1f}), "
f"but cannot scale down below min_endpoint ({self.config.min_endpoint}); "
f"maintaining {num_workers} decode workers"
)
return num_workers
logger.info(
f"Load-based decode: ALL workers below boundary ({boundary:.1f}), "
f"scaling down to {num_workers - 1}"
......
......@@ -226,6 +226,10 @@ class DisaggPlanner:
final_p = max(final_p, self.shared_state.throughput_lower_bound_p)
final_d = max(final_d, self.shared_state.throughput_lower_bound_d)
# Enforce minimum endpoints
final_p = max(final_p, self.config.min_endpoint)
final_d = max(final_d, self.config.min_endpoint)
# Apply GPU budget
final_p, final_d = _apply_global_gpu_budget(final_p, final_d, self.config)
......
......@@ -746,7 +746,9 @@ class BasePlanner:
def apply_component_budget(self, desired_replicas: int) -> int:
return _apply_component_gpu_budget(
desired_replicas, self._engine_num_gpu(), self.config
max(desired_replicas, self.config.min_endpoint),
self._engine_num_gpu(),
self.config,
)
async def _apply_scaling(self, desired_replicas: int) -> None:
......
......@@ -78,6 +78,13 @@ class PrefillPlanner(BasePlanner):
m.get("active_prefill_tokens", 0.0) < boundary for m in recent.values()
)
if all_below:
if num_workers - 1 < self.config.min_endpoint:
logger.info(
f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), "
f"but cannot scale down below min_endpoint ({self.config.min_endpoint}); "
f"maintaining {num_workers} prefill workers"
)
return num_workers
logger.info(
f"Load-based prefill: ALL workers below boundary ({boundary:.1f}), "
f"scaling down to {num_workers - 1}"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment