Unverified commit b42566f4, authored by Wentao Ye, committed by GitHub
Browse files

[Bug] Fix `is_flashmla_supported` Check Error (#24774)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent d96e1116
......@@ -17,7 +17,6 @@ from vllm.attention.backends.mla.common import (MLACommonBackend,
from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
get_mla_metadata,
is_flashmla_supported)
from vllm.platforms.cuda import CudaPlatform
class FlashMLABackend(MLACommonBackend):
......@@ -179,18 +178,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
logits_soft_cap, attn_type,
kv_sharing_target_layer_name, **mla_args)
assert is_flashmla_supported(), \
"FlashMLA is not supported on this device"
# disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs
# context:
# https://github.com/deepseek-ai/FlashMLA/issues/83
# https://github.com/vllm-project/vllm/issues/24513
if CudaPlatform.has_device_capability(100):
raise NotImplementedError(
"FlashMLA is temporarily disabled on Blackwell (SM 10.0). "
"Please use CUTLASS_MLA or TRITON_MLA instead. "
"Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`")
is_supported, reason = is_flashmla_supported()
assert is_supported, reason
unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
if any(unsupported_features):
......
......@@ -12,7 +12,6 @@ from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
is_flashmla_supported)
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.platforms.cuda import CudaPlatform
from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
MLACommonDecodeMetadata,
MLACommonImpl,
......@@ -156,18 +155,8 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
logits_soft_cap, attn_type,
kv_sharing_target_layer_name, **mla_args)
assert is_flashmla_supported(), \
"FlashMLA is not supported on this device"
# disallow FlashMLA on NVIDIA Blackwell (SM 10.0+) GPUs
# context:
# https://github.com/deepseek-ai/FlashMLA/issues/83
# https://github.com/vllm-project/vllm/issues/24513
if CudaPlatform.has_device_capability(100):
raise NotImplementedError(
"FlashMLA is temporarily disabled on Blackwell (SM 10.0). "
"Please use CUTLASS_MLA or TRITON_MLA instead. "
"Example: `export VLLM_ATTENTION_BACKEND=CUTLASS_MLA`")
is_supported, reason = is_flashmla_supported()
assert is_supported, reason
unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
if any(unsupported_features):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment