Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
001e50c9
Unverified
Commit
001e50c9
authored
Sep 30, 2025
by
Lucia Fang
Committed by
GitHub
Oct 01, 2025
Browse files
[Model] MTP fallback to eager for DeepSeek v32 (#25982)
Signed-off-by:
Lu Fang
<
fanglu@fb.com
>
parent
96ebcaa3
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
32 additions
and
5 deletions
+32
-5
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+10
-1
tests/v1/spec_decode/test_mtp.py
tests/v1/spec_decode/test_mtp.py
+7
-1
vllm/config/speculative.py
vllm/config/speculative.py
+7
-1
vllm/v1/attention/backends/mla/indexer.py
vllm/v1/attention/backends/mla/indexer.py
+1
-1
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+7
-1
No files found.
tests/v1/spec_decode/test_eagle.py
View file @
001e50c9
...
@@ -337,13 +337,19 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
...
@@ -337,13 +337,19 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
"target_attn_1"
:
mock
.
MagicMock
(),
"target_attn_1"
:
mock
.
MagicMock
(),
"target_attn_2"
:
mock
.
MagicMock
()
"target_attn_2"
:
mock
.
MagicMock
()
}
}
target_indx_layers
:
dict
[
str
,
mock
.
MagicMock
]
=
{}
# Draft model has one extra attention layer compared to target model
# Draft model has one extra attention layer compared to target model
all_attn_layers
=
{
all_attn_layers
=
{
**
target_attn_layers
,
"draft_extra_attn"
:
mock
.
MagicMock
()
**
target_attn_layers
,
"draft_extra_attn"
:
mock
.
MagicMock
()
}
}
all_indx_layers
:
dict
[
str
,
mock
.
MagicMock
]
=
{}
# Make mock_get_layers return different values for each call
# Make mock_get_layers return different values for each call
mock_get_layers
.
side_effect
=
[
target_attn_layers
,
all_attn_layers
]
mock_get_layers
.
side_effect
=
[
target_attn_layers
,
target_indx_layers
,
all_attn_layers
,
all_indx_layers
]
# Setup mock for pp group to return the appropriate value for world size
# Setup mock for pp group to return the appropriate value for world size
mock_pp_group
=
mock
.
MagicMock
()
mock_pp_group
=
mock
.
MagicMock
()
...
@@ -658,6 +664,9 @@ def test_propose_tree(spec_token_tree):
...
@@ -658,6 +664,9 @@ def test_propose_tree(spec_token_tree):
# Mock runner for attention metadata building.
# Mock runner for attention metadata building.
proposer
.
runner
=
mock
.
MagicMock
()
proposer
.
runner
=
mock
.
MagicMock
()
proposer
.
runner
.
attn_groups
.
append
([
mock
.
MagicMock
()])
proposer
.
runner
.
attn_groups
.
append
([
mock
.
MagicMock
()])
proposer
.
runner
.
attn_groups
[
0
][
0
].
metadata_builders
=
[
attn_metadata_builder
]
proposer
.
runner
.
attn_groups
[
0
][
0
].
get_metadata_builder
.
return_value
=
\
proposer
.
runner
.
attn_groups
[
0
][
0
].
get_metadata_builder
.
return_value
=
\
attn_metadata_builder
attn_metadata_builder
proposer
.
_get_attention_metadata_builder
=
mock
.
MagicMock
(
proposer
.
_get_attention_metadata_builder
=
mock
.
MagicMock
(
...
...
tests/v1/spec_decode/test_mtp.py
View file @
001e50c9
...
@@ -63,7 +63,13 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers,
...
@@ -63,7 +63,13 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers,
target_attn_layers
=
{
"target_attn_1"
:
mock
.
MagicMock
()}
target_attn_layers
=
{
"target_attn_1"
:
mock
.
MagicMock
()}
all_attn_layers
=
{
**
target_attn_layers
,
"draft_attn_1"
:
mock
.
MagicMock
()}
all_attn_layers
=
{
**
target_attn_layers
,
"draft_attn_1"
:
mock
.
MagicMock
()}
mock_get_layers
.
side_effect
=
[
target_attn_layers
,
all_attn_layers
]
target_indexer_layers
:
dict
=
{}
all_indexer_layers
:
dict
=
{}
mock_get_layers
.
side_effect
=
[
target_attn_layers
,
target_indexer_layers
,
all_attn_layers
,
all_indexer_layers
]
mock_pp_group
=
mock
.
MagicMock
()
mock_pp_group
=
mock
.
MagicMock
()
mock_pp_group
.
world_size
=
1
mock_pp_group
.
world_size
=
1
...
...
vllm/config/speculative.py
View file @
001e50c9
...
@@ -41,7 +41,8 @@ MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp",
...
@@ -41,7 +41,8 @@ MTP_MODEL_TYPES = ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp", "ernie_mtp",
@
dataclass
@
dataclass
class
SpeculativeConfig
:
class
SpeculativeConfig
:
"""Configuration for speculative decoding."""
"""Configuration for speculative decoding."""
enforce_eager
:
Optional
[
bool
]
=
None
"""Override the default enforce_eager from model_config"""
# General speculative decoding control
# General speculative decoding control
num_speculative_tokens
:
SkipValidation
[
int
]
=
None
# type: ignore
num_speculative_tokens
:
SkipValidation
[
int
]
=
None
# type: ignore
"""The number of speculative tokens, if provided. It will default to the
"""The number of speculative tokens, if provided. It will default to the
...
@@ -219,6 +220,11 @@ class SpeculativeConfig:
...
@@ -219,6 +220,11 @@ class SpeculativeConfig:
assert
(
assert
(
self
.
target_model_config
self
.
target_model_config
is
not
None
),
"target_model_config must be present for mtp"
is
not
None
),
"target_model_config must be present for mtp"
if
self
.
target_model_config
.
hf_text_config
.
model_type
\
==
"deepseek_v32"
:
# FIXME(luccafong): cudagraph with v32 MTP is not supported,
# remove this when the issue is fixed.
self
.
enforce_eager
=
True
# use the draft model from the same model:
# use the draft model from the same model:
self
.
model
=
self
.
target_model_config
.
model
self
.
model
=
self
.
target_model_config
.
model
# Align the quantization of draft model for cases such as
# Align the quantization of draft model for cases such as
...
...
vllm/v1/attention/backends/mla/indexer.py
View file @
001e50c9
...
@@ -171,7 +171,7 @@ def get_max_prefill_buffer_size(vllm_config: VllmConfig):
...
@@ -171,7 +171,7 @@ def get_max_prefill_buffer_size(vllm_config: VllmConfig):
class
DeepseekV32IndexerMetadataBuilder
(
AttentionMetadataBuilder
):
class
DeepseekV32IndexerMetadataBuilder
(
AttentionMetadataBuilder
):
cudagraph_support
:
ClassVar
[
AttentionCGSupport
]
=
\
cudagraph_support
:
ClassVar
[
AttentionCGSupport
]
=
\
AttentionCGSupport
.
UNIFORM_
BATCH
AttentionCGSupport
.
UNIFORM_
SINGLE_TOKEN_DECODE
reorder_batch_threshold
:
int
=
1
reorder_batch_threshold
:
int
=
1
...
...
vllm/v1/spec_decode/eagle.py
View file @
001e50c9
...
@@ -50,6 +50,7 @@ class EagleProposer:
...
@@ -50,6 +50,7 @@ class EagleProposer:
):
):
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
self
.
speculative_config
=
vllm_config
.
speculative_config
self
.
speculative_config
=
vllm_config
.
speculative_config
assert
self
.
speculative_config
is
not
None
self
.
draft_model_config
=
self
.
speculative_config
.
draft_model_config
self
.
draft_model_config
=
self
.
speculative_config
.
draft_model_config
self
.
method
=
self
.
speculative_config
.
method
self
.
method
=
self
.
speculative_config
.
method
...
@@ -74,11 +75,16 @@ class EagleProposer:
...
@@ -74,11 +75,16 @@ class EagleProposer:
vllm_config
.
model_config
)
vllm_config
.
model_config
)
self
.
attn_metadata_builder
:
Optional
[
AttentionMetadataBuilder
]
=
None
self
.
attn_metadata_builder
:
Optional
[
AttentionMetadataBuilder
]
=
None
self
.
draft_indexer_metadata_builder
:
Optional
[
AttentionMetadataBuilder
]
=
None
self
.
attn_layer_names
:
list
[
str
]
=
[]
self
.
indexer_layer_names
:
list
[
str
]
=
[]
self
.
use_cuda_graph
=
(
not
current_platform
.
is_xpu
()
self
.
use_cuda_graph
=
(
not
current_platform
.
is_xpu
()
and
self
.
vllm_config
.
compilation_config
.
level
and
self
.
vllm_config
.
compilation_config
.
level
==
CompilationLevel
.
PIECEWISE
and
==
CompilationLevel
.
PIECEWISE
and
not
self
.
vllm_config
.
model_config
.
enforce_eager
)
not
self
.
vllm_config
.
model_config
.
enforce_eager
and
not
self
.
speculative_config
.
enforce_eager
)
self
.
cudagraph_batch_sizes
=
list
(
self
.
cudagraph_batch_sizes
=
list
(
reversed
(
self
.
vllm_config
.
compilation_config
.
reversed
(
self
.
vllm_config
.
compilation_config
.
cudagraph_capture_sizes
))
if
self
.
use_cuda_graph
else
[]
cudagraph_capture_sizes
))
if
self
.
use_cuda_graph
else
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment