Merge branch 'v0.15.1-dev-wm' into 'v0.15.1-dev'

[feat] deepseek mtp支持pp模式 (DeepSeek MTP now supports pipeline-parallel (PP) mode). See merge request dcutoolkit/deeplearing/vllm!503

Merge branch 'v0.15.1-dev-wm' into 'v0.15.1-dev'
[feat] deepseek mtp支持pp模式 (DeepSeek MTP now supports pipeline-parallel (PP) mode). See merge request dcutoolkit/deeplearing/vllm!503
29646389 · wangmin6 · ba2f6226 · 2ce72b9c · 29646389 · 29646389
Commit 29646389 authored Mar 16, 2026 by wangmin6
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 35 deletions

vllm/model_executor/models/deepseek_mtp.py vllm/model_executor/models/deepseek_mtp.py +2 -1

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +36 -34

No files found.
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -37,6 +37,7 @@ from .deepseek_v2 import (
    get_spec_layer_idx_from_weight_name,
 )
 from .utils import maybe_prefix
+from .interfaces import SupportsPP
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.blockwise_int8 import BlockInt8Config
 import vllm.envs as envs
@@ -194,7 +195,7 @@ class DeepSeekMultiTokenPredictor(nn.Module):


 @support_torch_compile
-class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts):
+class DeepSeekMTP(nn.Module, DeepseekV2MixtureOfExperts, SupportsPP):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1902,7 +1902,7 @@ class GPUModelRunner(
                cm.block_table_tensor = _get_block_table(kv_cache_gid)
                cm.slot_mapping = slot_mappings[kv_cache_gid]

-            if self.speculative_config and spec_decode_common_attn_metadata is None:
+            if self.speculative_config and spec_decode_common_attn_metadata is None and hasattr(self, "drafter"):
                if isinstance(self.drafter, EagleProposer):
                    if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names:
                        spec_decode_common_attn_metadata = cm
@@ -4840,35 +4840,36 @@ class GPUModelRunner(
                self.speculative_config.use_eagle()
                or self.speculative_config.uses_draft_model()
            ):
-                assert isinstance(self.drafter, EagleProposer | DraftModelProposer)
-                assert self.speculative_config is not None
-                # Eagle currently only supports PIECEWISE cudagraphs.
-                # Therefore only use cudagraphs if the main model uses PIECEWISE
-                # NOTE(lucas): this is a hack, need to clean up.
-                use_cudagraphs = (
-                    (
-                        is_graph_capturing
-                        and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-                    )
-                    or (
-                        not is_graph_capturing
-                        and cudagraph_runtime_mode != CUDAGraphMode.NONE
+                #assert isinstance(self.drafter, EagleProposer | DraftModelProposer)
+                if hasattr(self, "drafter") and isinstance(self.drafter, EagleProposer | DraftModelProposer):
+                    assert self.speculative_config is not None
+                    # Eagle currently only supports PIECEWISE cudagraphs.
+                    # Therefore only use cudagraphs if the main model uses PIECEWISE
+                    # NOTE(lucas): this is a hack, need to clean up.
+                    use_cudagraphs = (
+                        (
+                            is_graph_capturing
+                            and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+                        )
+                        or (
+                            not is_graph_capturing
+                            and cudagraph_runtime_mode != CUDAGraphMode.NONE
+                        )
+                    ) and not self.speculative_config.enforce_eager
+
+                    # Note(gnovack) - We need to disable cudagraphs for one of the two
+                    # lora cases when cudagraph_specialize_lora is enabled. This is a
+                    # short term mitigation for issue mentioned in
+                    # https://github.com/vllm-project/vllm/issues/28334
+                    if self.compilation_config.cudagraph_specialize_lora and activate_lora:
+                        use_cudagraphs = False
+
+                    self.drafter.dummy_run(
+                        num_tokens,
+                        use_cudagraphs=use_cudagraphs,
+                        is_graph_capturing=is_graph_capturing,
+                        slot_mappings=slot_mappings,
                    )
-                ) and not self.speculative_config.enforce_eager
-
-                # Note(gnovack) - We need to disable cudagraphs for one of the two
-                # lora cases when cudagraph_specialize_lora is enabled. This is a
-                # short term mitigation for issue mentioned in
-                # https://github.com/vllm-project/vllm/issues/28334
-                if self.compilation_config.cudagraph_specialize_lora and activate_lora:
-                    use_cudagraphs = False
-
-                self.drafter.dummy_run(
-                    num_tokens,
-                    use_cudagraphs=use_cudagraphs,
-                    is_graph_capturing=is_graph_capturing,
-                    slot_mappings=slot_mappings,
-                )

        # We register layerwise NVTX hooks here after the first dynamo tracing is
        # done to avoid nvtx operations in hook functions being traced by
@@ -5544,7 +5545,7 @@ class GPUModelRunner(
        )

        # Initialize eagle's cudagraph dispatcher if using eagle spec decode.
-        if self.speculative_config and self.speculative_config.use_eagle():
+        if self.speculative_config and self.speculative_config.use_eagle() and hasattr(self, "drafter"):
            assert isinstance(self.drafter, EagleProposer)
            self.drafter.initialize_cudagraph_keys(cudagraph_mode)

@@ -6091,10 +6092,11 @@ class GPUModelRunner(
            self.speculative_config.use_eagle()
            or self.speculative_config.uses_draft_model()
        ):
-            assert isinstance(self.drafter, EagleProposer | DraftModelProposer)
-            # validate all draft model layers belong to the same kv cache
-            # group
-            self.drafter.validate_same_kv_cache_group(kv_cache_config)
+            #assert isinstance(self.drafter, EagleProposer | DraftModelProposer)
+            if hasattr(self, "drafter") and isinstance(self.drafter, EagleProposer | DraftModelProposer):
+                # validate all draft model layers belong to the same kv cache
+                # group
+                self.drafter.validate_same_kv_cache_group(kv_cache_config)

        if has_kv_transfer_group():
            kv_transfer_group = get_kv_transfer_group()