update flashmla support

1e9788a3 · zhuwenwen · eb38edbc · 1e9788a3 · 1e9788a3
Commit 1e9788a3 authored Oct 03, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 42 additions and 42 deletions

vllm/attention/ops/flashmla.py vllm/attention/ops/flashmla.py +2 -2

vllm/platforms/rocm.py vllm/platforms/rocm.py +40 -40

No files found.
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -154,8 +154,8 @@ def flash_mla_with_kvcache(
    else:
        if current_platform.is_rocm():
            out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
-                q, k_cache, block_table, cache_seqlens, head_dim_v, tile_scheduler_metadata,
-                num_splits, softmax_scale, causal)
+                q, k_cache, None, head_dim_v, cache_seqlens, block_table, softmax_scale,
+                causal, tile_scheduler_metadata, num_splits)
        else:
            out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(
                q, k_cache, head_dim_v, cache_seqlens, block_table, softmax_scale,

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -208,7 +208,22 @@ class RocmPlatform(Platform):

            # from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
            #     is_aiter_mla_enabled)
+            if envs.VLLM_USE_FLASH_MLA:
+                from vllm.attention.ops.flashmla import is_flashmla_supported
+                use_flashmla = selected_backend == _Backend.FLASHMLA or (
+                    selected_backend is None and is_flashmla_supported()[0])
                
+                if use_flashmla:
+                    if block_size != 64:
+                        logger.warning(
+                            "FlashMLA backend is not supported for block size %d"
+                            " (currently only supports block size 64).",
+                            block_size)
+                    else:
+                        logger.info_once("Using FlashMLA backend on V1 engine.")
+                        return ("vllm.v1.attention.backends.mla."
+                                "flashmla.FlashMLABackend")
+            else:
                if selected_backend is None:
                    # selected_backend = (_Backend.ROCM_AITER_MLA if
                    #                     is_aiter_mla_enabled() or block_size == 1
@@ -251,21 +266,6 @@ class RocmPlatform(Platform):
                logger.info("Using Rocm/Aiter Attention backend on V1 engine.")
                return ("vllm.v1.attention.backends."
                        "rocm_attn.RocmAttentionBackend")
-            if envs.VLLM_USE_FLASH_MLA:
-                from vllm.attention.ops.flashmla import is_flashmla_supported
-                use_flashmla = selected_backend == _Backend.FLASHMLA or (
-                    selected_backend is None and is_flashmla_supported()[0])
-                
-                if use_flashmla:
-                    if block_size != 64:
-                        logger.warning(
-                            "FlashMLA backend is not supported for block size %d"
-                            " (currently only supports block size 64).",
-                            block_size)
-                    else:
-                        logger.info_once("Using FlashMLA backend on V1 engine.")
-                        return ("vllm.v1.attention.backends.mla."
-                                "flashmla.FlashMLABackend")
            else:
                # default case, using triton unified attention
                logger.info("Using Triton Attention backend on V1 engine.")