fix: 修复 MTP runner 缺失 _extract_layer_index 导致的 KV 压缩崩溃

4634cbcf · laibao · 863f93e6 · 4634cbcf
Commit 4634cbcf authored Jan 23, 2026 by laibao
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 0 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +21 -0

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -427,6 +427,27 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
        if self.enable_expert_parallel and self.dp_size > 1 and self.tp_size > 1:
            self.ep_sp = True

+    @staticmethod
+    def _extract_layer_index(layer_name: str) -> int:
+        """Return the index parsed from `layer_name`, or `1 << 30` on failure.
+
+        KV compression prompt compaction (scheme 3) needs this to map
+        `kv_cache_group_spec.layer_names` entries to indices in
+        `self.kv_caches`.
+        """
+        from vllm.model_executor.models.utils import extract_layer_index
+        try:
+            parsed_index = extract_layer_index(layer_name)
+        except Exception as exc:
+            # Best effort: a name that cannot be parsed is logged and mapped
+            # to a sentinel index rather than taking the whole engine down.
+            logger.warning_once(
+                "Failed to parse layer index from layer name '%s': %s. "
+                "Skipping KV compaction for this layer.",
+                layer_name, exc)
+            return 1 << 30
+        return parsed_index
+
    def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
        """
        Update the order of requests in the batch based on the attention