[BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910)

1e799b7e · Lucas Wilkinson · GitHub · 7f6c5ee0 · 1e799b7e
Unverified commit 1e799b7e, authored Mar 16, 2025 by Lucas Wilkinson and committed by GitHub on Mar 17, 2025.
Show whitespace changes
Inline Side-by-side

Showing with 1 addition and 1 deletion

vllm/platforms/cuda.py +1 -1

No files found.
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -152,7 +152,7 @@ class CudaPlatformBase(Platform):
            # here
            use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
                or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
-            from vllm.attention.backends.flashmla import is_flashmla_supported
+            from vllm.attention.ops.flashmla import is_flashmla_supported
            if use_flashmla and is_flashmla_supported()[0] \
                and cache_config.block_size != 64:
                cache_config.block_size = 64