Fix Flashinfer Backend for SM120 Usage (#12325)

a1816187 · weiliang · GitHub · e39628fd · a1816187 · a1816187
Unverified commit a1816187, authored Oct 30, 2025 by weiliang; committed by GitHub on Oct 29, 2025.
2 changed files
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -26,8 +26,8 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
 from sglang.srt.speculative.spec_info import SpecInput
 from sglang.srt.utils import (
    get_int_env_var,
-    is_blackwell_supported,
    is_flashinfer_available,
+    is_sm100_supported,
    next_power_of_2,
 )

@@ -229,7 +229,7 @@ class FlashInferAttnBackend(AttentionBackend):
            ]

        fmha_backend = "auto"
-        if is_blackwell_supported():
+        if is_sm100_supported():
            # Disable CUTLASS backend when piecewise cuda graph is enabled
            # due to TMA descriptor initialization issues on B200
            if model_runner.server_args.enable_piecewise_cuda_graph:

--- a/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
@@ -25,8 +25,8 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMo
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.speculative.spec_info import SpecInput
 from sglang.srt.utils import (
-    is_blackwell_supported,
    is_flashinfer_available,
+    is_sm100_supported,
    next_power_of_2,
 )

@@ -242,9 +242,11 @@ class FlashInferMLAAttnBackend(AttentionBackend):
        else:
            self.q_indptr_decode = q_indptr_decode_buf

-        self.fmha_backend = "auto"
-        if is_blackwell_supported():
+        if is_sm100_supported():
            self.fmha_backend = "cutlass"
+        else:
+            self.fmha_backend = "auto"
+
        self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
            self.workspace_buffer, "NHD", backend=self.fmha_backend
        )