[Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)
Signed-off-by: mgoin <mgoin64@gmail.com>
b1361c72 · Michael Goin · GitHub · 4f0f844b · b1361c72
Unverified commit b1361c72, authored Aug 13, 2025 by Michael Goin; committed by GitHub on Aug 12, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 1 deletion

vllm/platforms/cuda.py vllm/platforms/cuda.py +6 -1

No files found.
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -152,6 +152,9 @@ class CudaPlatformBase(Platform):
                if cls.is_device_capability(100):
                    # Blackwell => Force CutlassMLA.
                    use_cutlass_mla = True
+                    # TODO: This does not work, because the
+                    # global_force_attn_backend_context_manager is not set.
+                    # See vllm/attention/selector.py:_cached_get_attn_backend
                    envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
                else:
                    # Not Blackwell
@@ -217,7 +220,9 @@ class CudaPlatformBase(Platform):
        if use_mla:
            # TODO(lucas): refactor to be more concise
            #  we should probably consider factoring out V1 here
-            if selected_backend == _Backend.CUTLASS_MLA:
+            if selected_backend == _Backend.CUTLASS_MLA or (
+                    cls.is_device_capability(100) and selected_backend is None
+                    and block_size == 128):
                if use_v1:
                    logger.info_once("Using Cutlass MLA backend on V1 engine.")
                    return ("vllm.v1.attention.backends.mla."