Unverified commit b1361c72, authored by Michael Goin and committed by GitHub
Browse files

[Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)


Signed-off-by: mgoin <mgoin64@gmail.com>
parent 4f0f844b
...@@ -152,6 +152,9 @@ class CudaPlatformBase(Platform): ...@@ -152,6 +152,9 @@ class CudaPlatformBase(Platform):
if cls.is_device_capability(100): if cls.is_device_capability(100):
# Blackwell => Force CutlassMLA. # Blackwell => Force CutlassMLA.
use_cutlass_mla = True use_cutlass_mla = True
# TODO: This does not work, because the
# global_force_attn_backend_context_manager is not set.
# See vllm/attention/selector.py:_cached_get_attn_backend
envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA"
else: else:
# Not Blackwell # Not Blackwell
...@@ -217,7 +220,9 @@ class CudaPlatformBase(Platform): ...@@ -217,7 +220,9 @@ class CudaPlatformBase(Platform):
if use_mla: if use_mla:
# TODO(lucas): refactor to be more concise # TODO(lucas): refactor to be more concise
# we should probably consider factoring out V1 here # we should probably consider factoring out V1 here
if selected_backend == _Backend.CUTLASS_MLA: if selected_backend == _Backend.CUTLASS_MLA or (
cls.is_device_capability(100) and selected_backend is None
and block_size == 128):
if use_v1: if use_v1:
logger.info_once("Using Cutlass MLA backend on V1 engine.") logger.info_once("Using Cutlass MLA backend on V1 engine.")
return ("vllm.v1.attention.backends.mla." return ("vllm.v1.attention.backends.mla."
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment