Unverified Commit 84701338 authored by Xiaoyu Zhang's avatar Xiaoyu Zhang Committed by GitHub
Browse files

[b200] fix piecewise cuda graph launch bug (#12067)

parent 93ef9a09
......@@ -230,7 +230,16 @@ class FlashInferAttnBackend(AttentionBackend):
fmha_backend = "auto"
if is_sm100_supported():
fmha_backend = "cutlass"
# Disable CUTLASS backend when piecewise cuda graph is enabled
# due to TMA descriptor initialization issues on B200
if model_runner.server_args.enable_piecewise_cuda_graph:
logger.warning(
"CUTLASS backend is disabled when piecewise cuda graph is enabled "
"due to TMA descriptor initialization issues on B200. "
"Using auto backend instead for stability."
)
else:
fmha_backend = "cutlass"
self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
self.workspace_buffer, "NHD", backend=fmha_backend
)
......
......@@ -250,6 +250,9 @@ class PiecewiseCudaGraphRunner:
lora_ids=None,
)
# Attention backend
self.model_runner.attn_backend.init_forward_metadata(forward_batch)
with set_forward_context(forward_batch, self.attention_layers):
_ = self.model_runner.model.forward(
forward_batch.input_ids,
......@@ -375,9 +378,6 @@ class PiecewiseCudaGraphRunner:
if lora_ids is not None:
self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
# # Attention backend
self.model_runner.attn_backend.init_forward_metadata(forward_batch)
# Run and capture
def run_once():
# Clean intermediate result cache for DP attention
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment