Merge branch 'v0.9.2-dev-wm' into 'v0.9.2-dev'

[fix] 修复v1 mtp接受率低的问题 (Fix the low MTP acceptance rate in v1). See merge request dcutoolkit/deeplearing/vllm!174

Merge branch 'v0.9.2-dev-wm' into 'v0.9.2-dev'
[fix] 修复v1 mtp接受率低的问题 (Fix the low MTP acceptance rate in v1). See merge request dcutoolkit/deeplearing/vllm!174
8e95b5e2 · gaoqiong · dea49b15 · 3dad13fb · 8e95b5e2 · 8e95b5e2
Commit 8e95b5e2 authored Aug 10, 2025 by gaoqiong
4 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4106,7 +4106,7 @@ class CompilationConfig:
    are always used, it can set this to False. Otherwise, it should
    set this to True, and the compiler will copy the input to an
    internally managed buffer. Default is False."""
-    full_cuda_graph: bool = True
+    full_cuda_graph: bool = False
    """whether to use a full cuda graph for the entire forward pass rather than
    splitting certain operations such as attention into subgraphs. Thus this
    flag cannot be used together with splitting_ops. This may provide
@@ -4948,4 +4948,4 @@ def get_layers_from_vllm_config(vllm_config: VllmConfig,
        for layer_name, layer in
        vllm_config.compilation_config.static_forward_context.items()
        if isinstance(layer, layer_type)
-    }
\ No newline at end of file
+    }
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -546,11 +546,14 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
        device = self.runner.device
        block_table = self.block_table
        block_table_tensor = block_table.get_device_tensor()[:num_reqs]
-        block_table.slot_mapping[:num_actual_tokens].copy_(
-            block_table.slot_mapping_cpu[:num_actual_tokens],
-            non_blocking=True)
-        block_table.slot_mapping[num_actual_tokens:].fill_(-1)
-        slot_mapping = block_table.slot_mapping[:num_actual_tokens]
+
+        slot_mapping = common_attn_metadata.slot_mapping
+        if slot_mapping is None:
+            block_table.slot_mapping[:num_actual_tokens].copy_(
+                block_table.slot_mapping_cpu[:num_actual_tokens],
+                non_blocking=True)
+            block_table.slot_mapping[num_actual_tokens:].fill_(-1)
+            slot_mapping = block_table.slot_mapping[:num_actual_tokens]

        query_start_loc = common_attn_metadata.query_start_loc
        seq_lens = common_attn_metadata.seq_lens

--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -45,6 +45,8 @@ class CommonAttentionMetadata:
    """(batch_size,), record the rejected tokens number in cpu and gpu"""
    num_speculative_tokens: int = 0
    """Number of speculative tokens"""
+    slot_mapping: torch.Tensor = None
+    """(batch_size, seq_len), slot mapping"""


 M = TypeVar("M")

--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -159,6 +159,7 @@ class EagleProposer:
                num_actual_tokens=num_tokens,
                max_query_len=max_query_len,
                num_rejected_tokens=num_rejected_tokens,
+                slot_mapping=target_slot_mapping
            )

            assert self.runner is not None