Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e2741f6c
Unverified
Commit
e2741f6c
authored
Nov 15, 2025
by
Cyrus Leung
Committed by
GitHub
Nov 14, 2025
Browse files
[Chore] Rename `SchedulerConfig.chunked_prefill_enabled` (#28735)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
67187554
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
21 additions
and
19 deletions
+21
-19
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+0
-1
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+4
-6
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+1
-1
vllm/config/scheduler.py
vllm/config/scheduler.py
+8
-3
vllm/config/vllm.py
vllm/config/vllm.py
+3
-3
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+1
-1
vllm/v1/core/sched/scheduler.py
vllm/v1/core/sched/scheduler.py
+1
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-2
No files found.
tests/v1/core/test_scheduler.py
View file @
e2741f6c
...
@@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
...
@@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
)
->
None
:
)
->
None
:
"""Validate chunked prefill settings in the scheduler config for
"""Validate chunked prefill settings in the scheduler config for
encoder-decoder models."""
encoder-decoder models."""
assert
scheduler_config
.
chunked_prefill_enabled
is
expect_enabled
assert
scheduler_config
.
enable_chunked_prefill
is
expect_enabled
assert
scheduler_config
.
enable_chunked_prefill
is
expect_enabled
if
is_encoder_decoder
:
if
is_encoder_decoder
:
# Encoder-decoder models should automatically disable chunked multimodal
# Encoder-decoder models should automatically disable chunked multimodal
...
...
tests/v1/e2e/test_spec_decode.py
View file @
e2741f6c
...
@@ -272,7 +272,7 @@ def test_speculators_model_integration(
...
@@ -272,7 +272,7 @@ def test_speculators_model_integration(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
[
"model_setup"
,
"mm_enabled"
,
"chunked_prefill
_enabled
"
],
[
"model_setup"
,
"mm_enabled"
,
"
enable_
chunked_prefill"
],
[
[
((
"eagle3"
,
"Qwen/Qwen3-8B"
,
"AngelSlim/Qwen3-8B_eagle3"
,
1
),
False
,
False
),
((
"eagle3"
,
"Qwen/Qwen3-8B"
,
"AngelSlim/Qwen3-8B_eagle3"
,
1
),
False
,
False
),
pytest
.
param
(
pytest
.
param
(
...
@@ -358,7 +358,7 @@ def test_eagle_correctness(
...
@@ -358,7 +358,7 @@ def test_eagle_correctness(
sampling_config
:
SamplingParams
,
sampling_config
:
SamplingParams
,
model_setup
:
tuple
[
str
,
str
,
str
,
int
],
model_setup
:
tuple
[
str
,
str
,
str
,
int
],
mm_enabled
:
bool
,
mm_enabled
:
bool
,
chunked_prefill
_enabled
:
bool
,
enable_
chunked_prefill
:
bool
,
attn_backend
:
str
,
attn_backend
:
str
,
):
):
if
attn_backend
==
"TREE_ATTN"
:
if
attn_backend
==
"TREE_ATTN"
:
...
@@ -396,9 +396,7 @@ def test_eagle_correctness(
...
@@ -396,9 +396,7 @@ def test_eagle_correctness(
method
,
model_name
,
spec_model_name
,
tp_size
=
model_setup
method
,
model_name
,
spec_model_name
,
tp_size
=
model_setup
max_model_len
=
2048
max_model_len
=
2048
max_num_batched_tokens
=
max_model_len
max_num_batched_tokens
=
128
if
enable_chunked_prefill
else
max_model_len
if
chunked_prefill_enabled
:
max_num_batched_tokens
=
128
ref_llm
=
LLM
(
ref_llm
=
LLM
(
model
=
model_name
,
max_model_len
=
max_model_len
,
tensor_parallel_size
=
tp_size
model
=
model_name
,
max_model_len
=
max_model_len
,
tensor_parallel_size
=
tp_size
...
@@ -420,7 +418,7 @@ def test_eagle_correctness(
...
@@ -420,7 +418,7 @@ def test_eagle_correctness(
},
},
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
chunked_prefill
_enabled
,
enable_chunked_prefill
=
enable_
chunked_prefill
,
)
)
spec_outputs
=
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
spec_outputs
=
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
matches
=
0
matches
=
0
...
...
tests/v1/engine/test_engine_core.py
View file @
e2741f6c
...
@@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache(
...
@@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache(
)
)
# Check 5: Verify chunked prefill is disabled
# Check 5: Verify chunked prefill is disabled
assert
not
vllm_config
.
scheduler_config
.
chunked_prefill
_enabled
,
(
assert
not
vllm_config
.
scheduler_config
.
enable_
chunked_prefill
,
(
"Encoder instance should disable chunked prefill (no KV cache)"
"Encoder instance should disable chunked prefill (no KV cache)"
)
)
...
...
vllm/config/scheduler.py
View file @
e2741f6c
...
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
...
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
from
pydantic
import
Field
,
field_validator
,
model_validator
from
pydantic
import
Field
,
field_validator
,
model_validator
from
pydantic.dataclasses
import
dataclass
from
pydantic.dataclasses
import
dataclass
from
typing_extensions
import
Self
from
typing_extensions
import
Self
,
deprecated
from
vllm.config.utils
import
config
from
vllm.config.utils
import
config
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -233,6 +233,11 @@ class SchedulerConfig:
...
@@ -233,6 +233,11 @@ class SchedulerConfig:
)
)
@
property
@
property
@
deprecated
(
"`SchedulerConfig.chunked_prefill_enabled` has been renamed to "
"`SchedulerConfig.enable_chunked_prefill`. "
"The old name will be removed in v0.12."
)
def
chunked_prefill_enabled
(
self
)
->
bool
:
def
chunked_prefill_enabled
(
self
)
->
bool
:
return
self
.
enable_chunked_prefill
return
self
.
enable_chunked_prefill
...
@@ -244,7 +249,7 @@ class SchedulerConfig:
...
@@ -244,7 +249,7 @@ class SchedulerConfig:
def
_verify_args
(
self
)
->
Self
:
def
_verify_args
(
self
)
->
Self
:
if
(
if
(
self
.
max_num_batched_tokens
<
self
.
max_model_len
self
.
max_num_batched_tokens
<
self
.
max_model_len
and
not
self
.
chunked_prefill
_enabled
and
not
self
.
enable_
chunked_prefill
):
):
raise
ValueError
(
raise
ValueError
(
f
"max_num_batched_tokens (
{
self
.
max_num_batched_tokens
}
) is "
f
"max_num_batched_tokens (
{
self
.
max_num_batched_tokens
}
) is "
...
@@ -271,7 +276,7 @@ class SchedulerConfig:
...
@@ -271,7 +276,7 @@ class SchedulerConfig:
)
)
if
self
.
max_num_partial_prefills
>
1
:
if
self
.
max_num_partial_prefills
>
1
:
if
not
self
.
chunked_prefill
_enabled
:
if
not
self
.
enable_
chunked_prefill
:
raise
ValueError
(
raise
ValueError
(
"Chunked prefill must be enabled to set "
"Chunked prefill must be enabled to set "
"max_num_partial_prefills > 1."
"max_num_partial_prefills > 1."
...
...
vllm/config/vllm.py
View file @
e2741f6c
...
@@ -411,7 +411,7 @@ class VllmConfig:
...
@@ -411,7 +411,7 @@ class VllmConfig:
if
(
if
(
self
.
model_config
is
not
None
self
.
model_config
is
not
None
and
self
.
scheduler_config
.
chunked_prefill
_enabled
and
self
.
scheduler_config
.
enable_
chunked_prefill
and
self
.
model_config
.
dtype
==
torch
.
float32
and
self
.
model_config
.
dtype
==
torch
.
float32
and
current_platform
.
get_device_capability
()
==
(
7
,
5
)
and
current_platform
.
get_device_capability
()
==
(
7
,
5
)
):
):
...
@@ -584,7 +584,7 @@ class VllmConfig:
...
@@ -584,7 +584,7 @@ class VllmConfig:
):
):
for
reason
in
disable_chunked_prefill_reasons
:
for
reason
in
disable_chunked_prefill_reasons
:
logger
.
info
(
reason
)
logger
.
info
(
reason
)
self
.
scheduler_config
.
chunked_prefill
_enabled
=
False
self
.
scheduler_config
.
enable_
chunked_prefill
=
False
self
.
scheduler_config
.
long_prefill_token_threshold
=
0
self
.
scheduler_config
.
long_prefill_token_threshold
=
0
if
self
.
cache_config
is
not
None
:
if
self
.
cache_config
is
not
None
:
...
@@ -1026,7 +1026,7 @@ class VllmConfig:
...
@@ -1026,7 +1026,7 @@ class VllmConfig:
f
"seed=
{
self
.
model_config
.
seed
}
, "
f
"seed=
{
self
.
model_config
.
seed
}
, "
f
"served_model_name=
{
self
.
model_config
.
served_model_name
}
, "
f
"served_model_name=
{
self
.
model_config
.
served_model_name
}
, "
f
"enable_prefix_caching=
{
self
.
cache_config
.
enable_prefix_caching
}
, "
f
"enable_prefix_caching=
{
self
.
cache_config
.
enable_prefix_caching
}
, "
f
"chunked_prefill
_enabled
=
{
self
.
scheduler_config
.
chunked_prefill
_enabled
}
, "
# noqa
f
"
enable_
chunked_prefill=
{
self
.
scheduler_config
.
enable_
chunked_prefill
}
, "
# noqa
f
"pooler_config=
{
self
.
model_config
.
pooler_config
!
r
}
, "
f
"pooler_config=
{
self
.
model_config
.
pooler_config
!
r
}
, "
f
"compilation_config=
{
self
.
compilation_config
!
r
}
"
f
"compilation_config=
{
self
.
compilation_config
!
r
}
"
)
)
...
...
vllm/platforms/cpu.py
View file @
e2741f6c
...
@@ -192,7 +192,7 @@ class CpuPlatform(Platform):
...
@@ -192,7 +192,7 @@ class CpuPlatform(Platform):
scheduler_config
=
vllm_config
.
scheduler_config
scheduler_config
=
vllm_config
.
scheduler_config
if
(
if
(
scheduler_config
.
chunked_prefill
_enabled
scheduler_config
.
enable_
chunked_prefill
or
cache_config
.
enable_prefix_caching
or
cache_config
.
enable_prefix_caching
)
and
cache_config
.
cache_dtype
!=
"auto"
:
)
and
cache_config
.
cache_dtype
!=
"auto"
:
raise
RuntimeError
(
raise
RuntimeError
(
...
...
vllm/v1/core/sched/scheduler.py
View file @
e2741f6c
...
@@ -497,7 +497,7 @@ class Scheduler(SchedulerInterface):
...
@@ -497,7 +497,7 @@ class Scheduler(SchedulerInterface):
# chunked prefill has to be enabled explicitly to allow
# chunked prefill has to be enabled explicitly to allow
# pooling requests to be chunked
# pooling requests to be chunked
if
(
if
(
not
self
.
scheduler_config
.
chunked_prefill
_enabled
not
self
.
scheduler_config
.
enable_
chunked_prefill
and
num_new_tokens
>
token_budget
and
num_new_tokens
>
token_budget
):
):
self
.
waiting
.
pop_request
()
self
.
waiting
.
pop_request
()
...
...
vllm/v1/engine/core.py
View file @
e2741f6c
...
@@ -124,7 +124,7 @@ class EngineCore:
...
@@ -124,7 +124,7 @@ class EngineCore:
# Encoder models without KV cache don't support
# Encoder models without KV cache don't support
# chunked prefill. But do SSM models?
# chunked prefill. But do SSM models?
logger
.
info
(
"Disabling chunked prefill for model without KVCache"
)
logger
.
info
(
"Disabling chunked prefill for model without KVCache"
)
vllm_config
.
scheduler_config
.
chunked_prefill
_enabled
=
False
vllm_config
.
scheduler_config
.
enable_
chunked_prefill
=
False
scheduler_block_size
=
(
scheduler_block_size
=
(
vllm_config
.
cache_config
.
block_size
vllm_config
.
cache_config
.
block_size
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
e2741f6c
...
@@ -2031,7 +2031,7 @@ class GPUModelRunner(
...
@@ -2031,7 +2031,7 @@ class GPUModelRunner(
supported_tasks
=
list
(
model
.
pooler
.
get_supported_tasks
())
supported_tasks
=
list
(
model
.
pooler
.
get_supported_tasks
())
if
self
.
scheduler_config
.
chunked_prefill
_enabled
:
if
self
.
scheduler_config
.
enable_
chunked_prefill
:
if
"token_embed"
in
supported_tasks
:
if
"token_embed"
in
supported_tasks
:
supported_tasks
.
remove
(
"token_embed"
)
supported_tasks
.
remove
(
"token_embed"
)
if
"token_classify"
in
supported_tasks
:
if
"token_classify"
in
supported_tasks
:
...
@@ -3825,7 +3825,7 @@ class GPUModelRunner(
...
@@ -3825,7 +3825,7 @@ class GPUModelRunner(
supported_pooling_tasks
=
self
.
get_supported_pooling_tasks
()
supported_pooling_tasks
=
self
.
get_supported_pooling_tasks
()
if
not
supported_pooling_tasks
:
if
not
supported_pooling_tasks
:
if
self
.
scheduler_config
.
chunked_prefill
_enabled
:
if
self
.
scheduler_config
.
enable_
chunked_prefill
:
raise
RuntimeError
(
raise
RuntimeError
(
f
"Model
{
self
.
model_config
.
model
}
does not support "
f
"Model
{
self
.
model_config
.
model
}
does not support "
"any pooling tasks with chunked prefill enabled. "
"any pooling tasks with chunked prefill enabled. "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment