Unverified Commit d185c881 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

fix(planner): don't block agg decode scaling when max_num_batched_tokens is missing (#8196)


Signed-off-by: default avatarhongkuanz <hongkuanz@nvidia.com>
Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent af0ff07c
......@@ -243,11 +243,12 @@ class LoadScalingMixin:
d_caps = self._capabilities.decode
max_tokens = d_caps.max_num_batched_tokens if d_caps else None
if not max_tokens or max_tokens <= 0:
logger.warning("max_num_batched_tokens not available, skipping agg scaling")
self._diag_load_reason = "insufficient_data"
return None
p_desired = self._agg_prefill_scaling(fpm_stats, num_workers, max_tokens)
logger.warning(
"max_num_batched_tokens not available, skipping agg prefill scaling"
)
p_desired = None
else:
p_desired = self._agg_prefill_scaling(fpm_stats, num_workers, max_tokens)
d_desired = self._agg_decode_scaling(fpm_stats, num_workers)
logger.info(
......@@ -258,6 +259,9 @@ class LoadScalingMixin:
desired = p_desired
elif d_desired is not None and d_desired > num_workers:
desired = d_desired
elif p_desired is None and d_desired is not None and d_desired < num_workers:
# Prefill signal unavailable: allow decode-only scale-down.
desired = d_desired
elif (
p_desired is not None
and p_desired < num_workers
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment