Unverified Commit 84701338 authored by Xiaoyu Zhang, committed by GitHub

[b200] fix piecewise cuda graph launch bug (#12067)

parent 93ef9a09
@@ -230,7 +230,16 @@ class FlashInferAttnBackend(AttentionBackend):
         fmha_backend = "auto"
         if is_sm100_supported():
-            fmha_backend = "cutlass"
+            # Disable CUTLASS backend when piecewise cuda graph is enabled
+            # due to TMA descriptor initialization issues on B200
+            if model_runner.server_args.enable_piecewise_cuda_graph:
+                logger.warning(
+                    "CUTLASS backend is disabled when piecewise cuda graph is enabled "
+                    "due to TMA descriptor initialization issues on B200. "
+                    "Using auto backend instead for stability."
+                )
+            else:
+                fmha_backend = "cutlass"
         self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
             self.workspace_buffer, "NHD", backend=fmha_backend
         )
...
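For context, a minimal standalone sketch of the backend-selection logic introduced above. The select_fmha_backend helper and its boolean parameters are hypothetical stand-ins for is_sm100_supported() and model_runner.server_args.enable_piecewise_cuda_graph; this is not the SGLang API, just the decision table the diff encodes.

import logging

logger = logging.getLogger(__name__)

def select_fmha_backend(is_sm100: bool, enable_piecewise_cuda_graph: bool) -> str:
    # Mirrors the diff: prefer CUTLASS on SM100 (B200) unless piecewise
    # cuda graph is enabled, in which case fall back to "auto".
    fmha_backend = "auto"
    if is_sm100:
        if enable_piecewise_cuda_graph:
            logger.warning(
                "CUTLASS backend is disabled when piecewise cuda graph is "
                "enabled due to TMA descriptor initialization issues on B200."
            )
        else:
            fmha_backend = "cutlass"
    return fmha_backend

assert select_fmha_backend(True, True) == "auto"
assert select_fmha_backend(True, False) == "cutlass"
assert select_fmha_backend(False, True) == "auto"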
@@ -250,6 +250,9 @@ class PiecewiseCudaGraphRunner:
             lora_ids=None,
         )
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
         with set_forward_context(forward_batch, self.attention_layers):
             _ = self.model_runner.model.forward(
                 forward_batch.input_ids,
@@ -375,9 +378,6 @@ class PiecewiseCudaGraphRunner:
         if lora_ids is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
-        # Attention backend
-        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
         # Run and capture
         def run_once():
             # Clean intermediate result cache for DP attention
...
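The second change moves attention-metadata initialization ahead of the capture path rather than leaving it just before run_once. A minimal sketch of that ordering, assuming hypothetical stand-ins (AttnBackend, capture, run_once) rather than the real PiecewiseCudaGraphRunner:

class AttnBackend:
    # Hypothetical stand-in for the real attention backend.
    def __init__(self):
        self.metadata_ready = False

    def init_forward_metadata(self, forward_batch):
        # The real method builds per-batch plans/descriptors.
        self.metadata_ready = True

def capture(attn_backend, forward_batch):
    # After the fix: metadata is initialized before the capture body runs,
    # so the captured region only replays work whose inputs already exist.
    attn_backend.init_forward_metadata(forward_batch)

    def run_once():
        assert attn_backend.metadata_ready, "metadata must be ready before capture"

    run_once()  # stands in for the warmup + cuda-graph capture loop

capture(AttnBackend(), forward_batch=None)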