Unverified Commit 84701338 authored by Xiaoyu Zhang's avatar Xiaoyu Zhang Committed by GitHub
Browse files

[b200] fix piecewise cuda graph launch bug (#12067)

parent 93ef9a09
......@@ -230,7 +230,16 @@ class FlashInferAttnBackend(AttentionBackend):
fmha_backend = "auto"
if is_sm100_supported():
fmha_backend = "cutlass"
# Disable CUTLASS backend when piecewise cuda graph is enabled
# due to TMA descriptor initialization issues on B200
if model_runner.server_args.enable_piecewise_cuda_graph:
logger.warning(
"CUTLASS backend is disabled when piecewise cuda graph is enabled "
"due to TMA descriptor initialization issues on B200. "
"Using auto backend instead for stability."
)
else:
fmha_backend = "cutlass"
self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
self.workspace_buffer, "NHD", backend=fmha_backend
)
......
......@@ -250,6 +250,9 @@ class PiecewiseCudaGraphRunner:
lora_ids=None,
)
# Attention backend
self.model_runner.attn_backend.init_forward_metadata(forward_batch)
with set_forward_context(forward_batch, self.attention_layers):
_ = self.model_runner.model.forward(
forward_batch.input_ids,
......@@ -375,9 +378,6 @@ class PiecewiseCudaGraphRunner:
if lora_ids is not None:
self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
# # Attention backend
self.model_runner.attn_backend.init_forward_metadata(forward_batch)
# Run and capture
def run_once():
# Clean intermediate result cache for DP attention
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment