[ROCm] Enable chunked prefill/paged attention in MLA on ROCm (#14316)

Signed-off-by: Sage Moore <sage@neuralmagic.com>

[ROCm] Enable chunked prefill/paged attention in MLA on ROCm (#14316)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
d9f83d62 · Sage Moore · GitHub · 4a754fcf · d9f83d62 · d9f83d62
Unverified commit d9f83d62, authored Mar 12, 2025 by Sage Moore; committed by GitHub on Mar 12, 2025
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 4 additions and 18 deletions

vllm/attention/backends/mla/common.py (+2 -16)

vllm/config.py (+2 -2)

No files found.
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -1327,21 +1327,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
                                               [0, q.shape[-1] - v.shape[-1]],
                                               value=0)
-            if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
+            if is_vllm_fa:
-                attn_output, attn_softmax_lse = self.triton_fa_func(
-                    q,
-                    k,
-                    v_padded,
-                    None,
-                    prefill_metadata.query_start_loc,
-                    prefill_metadata.context_chunk_cu_seq_lens[i],
-                    prefill_metadata.max_query_len,
-                    prefill_metadata.context_chunk_max_seq_lens[i],
-                    False,  # causal
-                    self.scale,
-                    None,  # attn_mask is None unless applying ALiBi mask
-                )
-            elif is_vllm_fa:
                attn_output, attn_softmax_lse = self.flash_attn_varlen_func(
                    q=q,
                    k=k,
@@ -1416,7 +1402,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
        v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
                                           value=0)
-        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
+        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN and not has_context:
            output = self.triton_fa_func(
                q,
                k,

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3450,9 +3450,9 @@ class VllmConfig:
            self.compilation_config.level = CompilationLevel.NO_COMPILATION
        if self.model_config and self.model_config.use_mla and \
-            not current_platform.is_cuda():
+            not (current_platform.is_cuda() or current_platform.is_rocm()):
            logger.info(
-                "MLA is enabled on a non-cuda platform; forcing chunked "
+                "MLA is enabled on a non-GPU platform; forcing chunked "
                "prefill and prefix caching to be disabled.")
            self.scheduler_config.enable_chunked_prefill = False
            self.scheduler_config.chunked_prefill_enabled = False