Support FlashAttention Backend for Hybrid SSM Models (#23299)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>

Support FlashAttention Backend for Hybrid SSM Models (#23299)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2b4fc9bd · Chen Zhang · GitHub · ebd5a77b · 2b4fc9bd · 2b4fc9bd
Unverified commit 2b4fc9bd, authored Aug 26, 2025 by Chen Zhang; committed by GitHub on Aug 26, 2025.
Showing 2 changed files with 17 additions and 27 deletions

tests/models/language/generation/test_hybrid.py +0 -3

vllm/v1/worker/gpu_model_runner.py +17 -24

No files found.
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -110,9 +110,6 @@ def test_models(
    if model in V1_SUPPORTED_MODELS:
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1")
-            if model in HYBRID_MODELS:
-                # required due to reorder_batch behaviour
-                m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
            with vllm_runner(model,
                             max_num_seqs=MAX_NUM_SEQS,
                             enable_prefix_caching=False) as vllm_model:

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3023,40 +3023,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                    raise NotImplementedError
        if has_attn and has_mamba:
-            self._verify_hybrid_attention_mamba_layout(kv_cache_config,
+            self._update_hybrid_attention_mamba_layout(kv_caches)
-                                                       kv_cache_raw_tensors)
        return kv_caches
-    def _verify_hybrid_attention_mamba_layout(
+    def _update_hybrid_attention_mamba_layout(
-            self, kv_cache_config: KVCacheConfig,
+            self, kv_caches: dict[str, torch.Tensor]) -> None:
-            kv_cache_raw_tensors: dict[str, torch.Tensor]) -> None:
        """
-        Verify that the KV cache memory layout is compatible for
+        Update the layout of attention layers from (2, num_blocks, ...) to
-        models with both attention and mamba KV cache groups.
+        (num_blocks, 2, ...).
        Args:
-            kv_cache_config: The KV cache config
+            kv_caches: The KV cache buffer of each layer.
-            kv_cache_raw_tensors: The KV cache buffer of each layer.
        """
        for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator():
            for layer_name in group.layer_names:
-                raw_tensor = kv_cache_raw_tensors[layer_name]
+                kv_cache = kv_caches[layer_name]
-                num_blocks = (raw_tensor.numel() //
+                if (isinstance(kv_cache_spec, AttentionSpec)
-                              kv_cache_spec.page_size_bytes)
+                        and kv_cache.shape[0] == 2):
-                if isinstance(kv_cache_spec, AttentionSpec):
+                    assert kv_cache.shape[1] != 2, \
+                        "Fail to determine whether the layout is " \
-                    kv_cache_shape = group.backend.get_kv_cache_shape(
+                        "(2, num_blocks, ...) or (num_blocks, 2, ...) for " \
-                        num_blocks, kv_cache_spec.block_size,
+                        f"a tensor of shape {kv_cache.shape}"
-                        kv_cache_spec.num_kv_heads, kv_cache_spec.head_size)
+                    hidden_size = kv_cache.shape[2:].numel()
-                    if kv_cache_shape[0] != num_blocks or kv_cache_shape[
+                    kv_cache.as_strided_(size=kv_cache.shape,
-                            1] != 2:
+                                         stride=(hidden_size, 2 * hidden_size,
-                        raise ValueError(
+                                                 *kv_cache.stride()[2:]))
-                            "Hybrid models in V1 require an attention "
-                            "backend with kv_cache_shape="
-                            "(num_blocks, 2, ...). Please try setting "
-                            "VLLM_ATTENTION_BACKEND=FLASHINFER")
    def initialize_kv_cache_tensors(
            self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]: