Unverified Commit 52ce1420 authored by Maximilien de Bayser's avatar Maximilien de Bayser Committed by GitHub
Browse files

Fix handling of `max_num_batched_tokens` for pooling tasks (#23004)


Signed-off-by: default avatarMax de Bayser <mbayser@br.ibm.com>
parent 829bbd78
......@@ -3600,9 +3600,6 @@ class VllmConfig:
logger.info(reason)
self.scheduler_config.chunked_prefill_enabled = False
self.scheduler_config.long_prefill_token_threshold = 0
self.scheduler_config.max_num_batched_tokens = max(
self.scheduler_config.max_model_len,
DEFAULT_MAX_NUM_BATCHED_TOKENS)
if self.cache_config is not None:
self.cache_config.enable_prefix_caching = False
......
......@@ -1602,9 +1602,6 @@ class EngineArgs:
self.enable_prefix_caching = incremental_prefill_supported
logger.info("(%s) prefix caching by default", action)
if not self.enable_chunked_prefill:
self.max_num_batched_tokens = model_config.max_model_len
# V1 should use the new scheduler by default.
# Swap it only if this arg is set to the original V0 default
if self.scheduler_cls == EngineArgs.scheduler_cls:
......@@ -1692,8 +1689,11 @@ class EngineArgs:
self.max_num_batched_tokens = \
default_max_num_batched_tokens[usage_context]
else:
self.max_num_batched_tokens = default_max_num_batched_tokens[
usage_context]
if not self.enable_chunked_prefill:
self.max_num_batched_tokens = model_config.max_model_len
else:
self.max_num_batched_tokens = \
default_max_num_batched_tokens[usage_context]
logger.debug(
"Setting max_num_batched_tokens to %d for %s usage context.",
self.max_num_batched_tokens, use_context_value)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment