Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
52ce1420
Unverified
Commit
52ce1420
authored
Aug 16, 2025
by
Maximilien de Bayser
Committed by
GitHub
Aug 16, 2025
Browse files
Fix handling of `max_num_batched_tokens` for pooling tasks (#23004)
Signed-off-by:
Max de Bayser
<
mbayser@br.ibm.com
>
parent
829bbd78
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
5 additions
and
8 deletions
+5
-8
vllm/config/__init__.py
vllm/config/__init__.py
+0
-3
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+5
-5
No files found.
vllm/config/__init__.py
View file @
52ce1420
...
@@ -3600,9 +3600,6 @@ class VllmConfig:
...
@@ -3600,9 +3600,6 @@ class VllmConfig:
logger
.
info
(
reason
)
logger
.
info
(
reason
)
self
.
scheduler_config
.
chunked_prefill_enabled
=
False
self
.
scheduler_config
.
chunked_prefill_enabled
=
False
self
.
scheduler_config
.
long_prefill_token_threshold
=
0
self
.
scheduler_config
.
long_prefill_token_threshold
=
0
self
.
scheduler_config
.
max_num_batched_tokens
=
max
(
self
.
scheduler_config
.
max_model_len
,
DEFAULT_MAX_NUM_BATCHED_TOKENS
)
if
self
.
cache_config
is
not
None
:
if
self
.
cache_config
is
not
None
:
self
.
cache_config
.
enable_prefix_caching
=
False
self
.
cache_config
.
enable_prefix_caching
=
False
...
...
vllm/engine/arg_utils.py
View file @
52ce1420
...
@@ -1602,9 +1602,6 @@ class EngineArgs:
...
@@ -1602,9 +1602,6 @@ class EngineArgs:
self
.
enable_prefix_caching
=
incremental_prefill_supported
self
.
enable_prefix_caching
=
incremental_prefill_supported
logger
.
info
(
"(%s) prefix caching by default"
,
action
)
logger
.
info
(
"(%s) prefix caching by default"
,
action
)
if
not
self
.
enable_chunked_prefill
:
self
.
max_num_batched_tokens
=
model_config
.
max_model_len
# V1 should use the new scheduler by default.
# V1 should use the new scheduler by default.
# Swap it only if this arg is set to the original V0 default
# Swap it only if this arg is set to the original V0 default
if
self
.
scheduler_cls
==
EngineArgs
.
scheduler_cls
:
if
self
.
scheduler_cls
==
EngineArgs
.
scheduler_cls
:
...
@@ -1692,8 +1689,11 @@ class EngineArgs:
...
@@ -1692,8 +1689,11 @@ class EngineArgs:
self
.
max_num_batched_tokens
=
\
self
.
max_num_batched_tokens
=
\
default_max_num_batched_tokens
[
usage_context
]
default_max_num_batched_tokens
[
usage_context
]
else
:
else
:
self
.
max_num_batched_tokens
=
default_max_num_batched_tokens
[
if
not
self
.
enable_chunked_prefill
:
usage_context
]
self
.
max_num_batched_tokens
=
model_config
.
max_model_len
else
:
self
.
max_num_batched_tokens
=
\
default_max_num_batched_tokens
[
usage_context
]
logger
.
debug
(
logger
.
debug
(
"Setting max_num_batched_tokens to %d for %s usage context."
,
"Setting max_num_batched_tokens to %d for %s usage context."
,
self
.
max_num_batched_tokens
,
use_context_value
)
self
.
max_num_batched_tokens
,
use_context_value
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment