Unverified commit d185c881, authored by Hongkuan Zhou, committed by GitHub
Browse files

fix(planner): don't block agg decode scaling when max_num_batched_tokens is missing (#8196)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent af0ff07c
...@@ -243,10 +243,11 @@ class LoadScalingMixin: ...@@ -243,10 +243,11 @@ class LoadScalingMixin:
d_caps = self._capabilities.decode d_caps = self._capabilities.decode
max_tokens = d_caps.max_num_batched_tokens if d_caps else None max_tokens = d_caps.max_num_batched_tokens if d_caps else None
if not max_tokens or max_tokens <= 0: if not max_tokens or max_tokens <= 0:
logger.warning("max_num_batched_tokens not available, skipping agg scaling") logger.warning(
self._diag_load_reason = "insufficient_data" "max_num_batched_tokens not available, skipping agg prefill scaling"
return None )
p_desired = None
else:
p_desired = self._agg_prefill_scaling(fpm_stats, num_workers, max_tokens) p_desired = self._agg_prefill_scaling(fpm_stats, num_workers, max_tokens)
d_desired = self._agg_decode_scaling(fpm_stats, num_workers) d_desired = self._agg_decode_scaling(fpm_stats, num_workers)
...@@ -258,6 +259,9 @@ class LoadScalingMixin: ...@@ -258,6 +259,9 @@ class LoadScalingMixin:
desired = p_desired desired = p_desired
elif d_desired is not None and d_desired > num_workers: elif d_desired is not None and d_desired > num_workers:
desired = d_desired desired = d_desired
elif p_desired is None and d_desired is not None and d_desired < num_workers:
# Prefill signal unavailable: allow decode-only scale-down.
desired = d_desired
elif ( elif (
p_desired is not None p_desired is not None
and p_desired < num_workers and p_desired < num_workers
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment