Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9e5bd307
Unverified
Commit
9e5bd307
authored
Oct 31, 2025
by
Nick Hill
Committed by
GitHub
Oct 31, 2025
Browse files
[Cleanup] Remove no-longer-used `SpeculativeConfig.enable_chunked_prefill` (#27826)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
fc16f1c4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
1 addition
and
16 deletions
+1
-16
vllm/config/speculative.py
vllm/config/speculative.py
+0
-10
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+0
-6
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+1
-0
No files found.
vllm/config/speculative.py
View file @
9e5bd307
...
@@ -78,10 +78,6 @@ class SpeculativeConfig:
...
@@ -78,10 +78,6 @@ class SpeculativeConfig:
draft_tensor_parallel_size
:
int
|
None
=
Field
(
default
=
None
,
ge
=
1
)
draft_tensor_parallel_size
:
int
|
None
=
Field
(
default
=
None
,
ge
=
1
)
"""The degree of the tensor parallelism for the draft model. Can only be 1
"""The degree of the tensor parallelism for the draft model. Can only be 1
or the same as the target model's tensor parallel size."""
or the same as the target model's tensor parallel size."""
disable_logprobs
:
bool
=
True
"""If set to True, token log probabilities are not returned during
speculative decoding. If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams."""
# Draft model configuration
# Draft model configuration
quantization
:
me_quant
.
QuantizationMethods
|
None
=
None
quantization
:
me_quant
.
QuantizationMethods
|
None
=
None
...
@@ -126,12 +122,6 @@ class SpeculativeConfig:
...
@@ -126,12 +122,6 @@ class SpeculativeConfig:
"""The configuration of the target model."""
"""The configuration of the target model."""
target_parallel_config
:
SkipValidation
[
ParallelConfig
]
=
None
# type: ignore
target_parallel_config
:
SkipValidation
[
ParallelConfig
]
=
None
# type: ignore
"""The parallel configuration for the target model."""
"""The parallel configuration for the target model."""
enable_chunked_prefill
:
SkipValidation
[
bool
]
=
None
# type: ignore
"""Whether vLLM is configured to use chunked prefill or not. Used for
raising an error since it's not yet compatible with speculative decode."""
disable_log_stats
:
SkipValidation
[
bool
]
=
None
# type: ignore
"""Whether to disable the periodic printing of stage times in speculative
decoding."""
# params generated in the post-init stage
# params generated in the post-init stage
draft_model_config
:
SkipValidation
[
ModelConfig
]
=
None
# type: ignore
draft_model_config
:
SkipValidation
[
ModelConfig
]
=
None
# type: ignore
...
...
vllm/engine/arg_utils.py
View file @
9e5bd307
...
@@ -1246,8 +1246,6 @@ class EngineArgs:
...
@@ -1246,8 +1246,6 @@ class EngineArgs:
self
,
self
,
target_model_config
:
ModelConfig
,
target_model_config
:
ModelConfig
,
target_parallel_config
:
ParallelConfig
,
target_parallel_config
:
ParallelConfig
,
enable_chunked_prefill
:
bool
,
disable_log_stats
:
bool
,
)
->
SpeculativeConfig
|
None
:
)
->
SpeculativeConfig
|
None
:
"""Initializes and returns a SpeculativeConfig object based on
"""Initializes and returns a SpeculativeConfig object based on
`speculative_config`.
`speculative_config`.
...
@@ -1267,8 +1265,6 @@ class EngineArgs:
...
@@ -1267,8 +1265,6 @@ class EngineArgs:
{
{
"target_model_config"
:
target_model_config
,
"target_model_config"
:
target_model_config
,
"target_parallel_config"
:
target_parallel_config
,
"target_parallel_config"
:
target_parallel_config
,
"enable_chunked_prefill"
:
enable_chunked_prefill
,
"disable_log_stats"
:
disable_log_stats
,
}
}
)
)
return
SpeculativeConfig
(
**
self
.
speculative_config
)
return
SpeculativeConfig
(
**
self
.
speculative_config
)
...
@@ -1561,8 +1557,6 @@ class EngineArgs:
...
@@ -1561,8 +1557,6 @@ class EngineArgs:
speculative_config
=
self
.
create_speculative_config
(
speculative_config
=
self
.
create_speculative_config
(
target_model_config
=
model_config
,
target_model_config
=
model_config
,
target_parallel_config
=
parallel_config
,
target_parallel_config
=
parallel_config
,
enable_chunked_prefill
=
self
.
enable_chunked_prefill
,
disable_log_stats
=
self
.
disable_log_stats
,
)
)
# make sure num_lookahead_slots is set appropriately depending on
# make sure num_lookahead_slots is set appropriately depending on
...
...
vllm/entrypoints/openai/api_server.py
View file @
9e5bd307
...
@@ -241,6 +241,7 @@ async def build_async_engine_client_from_engine_args(
...
@@ -241,6 +241,7 @@ async def build_async_engine_client_from_engine_args(
)
)
# Don't keep the dummy data in memory
# Don't keep the dummy data in memory
assert
async_llm
is
not
None
await
async_llm
.
reset_mm_cache
()
await
async_llm
.
reset_mm_cache
()
yield
async_llm
yield
async_llm
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment