Revert "[Attention][MLA] Make `FLASHINFER_MLA` the default MLA backen… (#32484)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>

Revert "[Attention][MLA] Make `FLASHINFER_MLA` the default MLA backen… (#32484)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2e7c89e7 · Matthew Bonanni · GitHub · 037a6487 · 2e7c89e7 · 2e7c89e7
Unverified Commit 2e7c89e7 authored Jan 16, 2026 by Matthew Bonanni Committed by GitHub Jan 17, 2026
3 changed files
--- a/vllm/config/attention.py
+++ b/vllm/config/attention.py
@@ -35,7 +35,7 @@ class AttentionConfig:
    use_cudnn_prefill: bool = False
    """Whether to use cudnn prefill."""

-    use_trtllm_ragged_deepseek_prefill: bool = True
+    use_trtllm_ragged_deepseek_prefill: bool = False
    """Whether to use TRTLLM ragged deepseek prefill."""

    use_trtllm_attention: bool | None = None

--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -450,6 +450,7 @@ def use_flashinfer_prefill() -> bool:
        not vllm_config.attention_config.disable_flashinfer_prefill
        and flashinfer_available
        and not vllm_config.attention_config.use_cudnn_prefill
+        and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill
        and current_platform.is_device_capability_family(100)
    )

@@ -1323,27 +1324,25 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

-        if use_trtllm_ragged_deepseek_prefill():
-            logger.info_once(
-                "Using TRT-LLM ragged DeepSeek prefill for MLA", scope="local"
-            )
+        if use_flashinfer_prefill():
+            logger.debug_once("Using FlashInfer prefill for MLA")
+            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
+            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
+            self._pad_v = False
+        elif use_trtllm_ragged_deepseek_prefill():
+            logger.debug_once("Using TRT-LLM ragged DeepSeek prefill for MLA")
            self._run_prefill_context_chunk = (
                self._run_prefill_context_chunk_trtllm_ragged
            )
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_trtllm_ragged
            self._pad_v = False
-        elif use_flashinfer_prefill():
-            logger.info_once("Using FlashInfer prefill for MLA")
-            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fi
-            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fi
-            self._pad_v = False
        elif use_cudnn_prefill():
-            logger.info_once("Using CUDNN prefill for MLA", scope="local")
+            logger.debug_once("Using CUDNN prefill for MLA")
            self._run_prefill_context_chunk = self._run_prefill_context_chunk_cudnn
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_cudnn
            self._pad_v = False
        else:  # Use FlashAttention
-            logger.info_once("Using FlashAttention prefill for MLA", scope="local")
+            logger.debug_once("Using FlashAttention prefill for MLA")
            self._run_prefill_context_chunk = self._run_prefill_context_chunk_fa
            self._run_prefill_new_tokens = self._run_prefill_new_tokens_fa


--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -50,8 +50,8 @@ def _get_backend_priorities(
    if use_mla:
        if device_capability.major == 10:
            return [
-                AttentionBackendEnum.FLASHINFER_MLA,
                AttentionBackendEnum.CUTLASS_MLA,
+                AttentionBackendEnum.FLASHINFER_MLA,
                AttentionBackendEnum.FLASH_ATTN_MLA,
                AttentionBackendEnum.FLASHMLA,
                AttentionBackendEnum.TRITON_MLA,
@@ -183,12 +183,12 @@ class CudaPlatformBase(Platform):
            if vllm_config.attention_config.backend is None:
                # Default case
                if cls.is_device_capability_family(100) and not use_sparse:
-                    # Blackwell => Force FlashInferMLA (unless sparse, i.e. DSv3.2).
-                    use_flashinfer_mla = True
+                    # Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2).
+                    use_cutlass_mla = True
                    # Set the backend in AttentionConfig so it's used during
                    # backend selection
                    vllm_config.attention_config.backend = (
-                        AttentionBackendEnum.FLASHINFER_MLA
+                        AttentionBackendEnum.CUTLASS_MLA
                    )
                else:
                    # Not Blackwell