Commit 4634cbcf authored by laibao's avatar laibao
Browse files

fix: 修复 MTP runner 缺失 _extract_layer_index 导致的 KV 压缩崩溃

parent 863f93e6
...@@ -427,6 +427,27 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin): ...@@ -427,6 +427,27 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
if self.enable_expert_parallel and self.dp_size > 1 and self.tp_size > 1: if self.enable_expert_parallel and self.dp_size > 1 and self.tp_size > 1:
self.ep_sp = True self.ep_sp = True
@staticmethod
def _extract_layer_index(layer_name: str) -> int:
    """Return the attention layer index encoded in a module name.

    KV compression prompt compaction (scheme 3) uses this to map
    `kv_cache_group_spec.layer_names` entries to indices in
    `self.kv_caches`.

    Args:
        layer_name: Module name to parse.

    Returns:
        The index reported by `extract_layer_index`. If that helper raises
        any `Exception`, a warning is logged and the sentinel `1 << 30` is
        returned instead of propagating the error.
    """
    from vllm.model_executor.models.utils import extract_layer_index

    try:
        layer_idx = extract_layer_index(layer_name)
    except Exception as exc:
        # Best effort by design: a name that does not follow the expected
        # pattern must not take down the whole engine.
        logger.warning_once(
            "Failed to parse layer index from layer name '%s': %s. "
            "Skipping KV compaction for this layer.",
            layer_name,
            exc,
        )
        # NOTE(review): presumably callers treat this very large value as
        # out of range for `self.kv_caches` and skip the layer - confirm.
        return 1 << 30
    return layer_idx
def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None: def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
""" """
Update the order of requests in the batch based on the attention Update the order of requests in the batch based on the attention
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment