Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
84701338
Unverified
Commit
84701338
authored
Oct 24, 2025
by
Xiaoyu Zhang
Committed by
GitHub
Oct 24, 2025
Browse files
[b200] fix piecewise cuda graph launch bug (#12067)
parent
93ef9a09
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
4 deletions
+13
-4
python/sglang/srt/layers/attention/flashinfer_backend.py
python/sglang/srt/layers/attention/flashinfer_backend.py
+10
-1
python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py
.../sglang/srt/model_executor/piecewise_cuda_graph_runner.py
+3
-3
No files found.
python/sglang/srt/layers/attention/flashinfer_backend.py
View file @
84701338
...
...
@@ -230,7 +230,16 @@ class FlashInferAttnBackend(AttentionBackend):
fmha_backend
=
"auto"
if
is_sm100_supported
():
fmha_backend
=
"cutlass"
# Disable CUTLASS backend when piecewise cuda graph is enabled
# due to TMA descriptor initialization issues on B200
if
model_runner
.
server_args
.
enable_piecewise_cuda_graph
:
logger
.
warning
(
"CUTLASS backend is disabled when piecewise cuda graph is enabled "
"due to TMA descriptor initialization issues on B200. "
"Using auto backend instead for stability."
)
else
:
fmha_backend
=
"cutlass"
self
.
prefill_wrapper_ragged
=
BatchPrefillWithRaggedKVCacheWrapper
(
self
.
workspace_buffer
,
"NHD"
,
backend
=
fmha_backend
)
...
...
python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py
View file @
84701338
...
...
@@ -250,6 +250,9 @@ class PiecewiseCudaGraphRunner:
lora_ids
=
None
,
)
# Attention backend
self
.
model_runner
.
attn_backend
.
init_forward_metadata
(
forward_batch
)
with
set_forward_context
(
forward_batch
,
self
.
attention_layers
):
_
=
self
.
model_runner
.
model
.
forward
(
forward_batch
.
input_ids
,
...
...
@@ -375,9 +378,6 @@ class PiecewiseCudaGraphRunner:
if
lora_ids
is
not
None
:
self
.
model_runner
.
lora_manager
.
prepare_lora_batch
(
forward_batch
)
# # Attention backend
self
.
model_runner
.
attn_backend
.
init_forward_metadata
(
forward_batch
)
# Run and capture
def
run_once
():
# Clean intermediate result cache for DP attention
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment