Commit 69fe3c97 (unverified)
Authored Oct 18, 2025 by Zilin Zhu; committed by GitHub, Oct 18, 2025
Parent: 8af84912

Manually flip deepep_mode for cuda_graph (#11666)
Showing 4 changed files with 49 additions and 0 deletions (+49 -0):

  python/sglang/srt/layers/moe/token_dispatcher/deepep.py                 +9   -0
  python/sglang/srt/model_executor/cuda_graph_runner.py                   +28  -0
  python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py          +6   -0
  python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py   +6   -0
python/sglang/srt/layers/moe/token_dispatcher/deepep.py

@@ -235,6 +235,15 @@ class DeepEPBuffer:
         cls.clean_buffer()
         cls._dispatch_mode = DeepEPDispatchMode.LOW_LATENCY
 
+    @classmethod
+    def set_dispatch_mode(cls, mode: DeepEPMode):
+        if mode.is_low_latency():
+            cls.set_dispatch_mode_as_low_latency()
+        elif mode.is_normal():
+            cls.set_dispatch_mode_as_normal()
+        else:
+            raise Exception("unsupported mode")
+
 
 class DeepEPConfig(BaseDispatcherConfig):
     _instance = None
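The new set_dispatch_mode classmethod is a thin dispatcher over the two existing per-mode setters. Below is a minimal, self-contained sketch of that pattern; DeepEPDispatchMode, DeepEPMode, and Buffer are stand-ins for sglang's real classes (the is_normal/is_low_latency predicates are assumed to exist on DeepEPMode, as the diff implies), and the buffer-cleaning side effects of the real setters are omitted.

    from enum import Enum, auto

    class DeepEPDispatchMode(Enum):      # stand-in for sglang's enum
        NORMAL = auto()
        LOW_LATENCY = auto()

    class DeepEPMode(Enum):              # stand-in; predicates mirror the diff
        NORMAL = "normal"
        LOW_LATENCY = "low_latency"

        def is_normal(self) -> bool:
            return self is DeepEPMode.NORMAL

        def is_low_latency(self) -> bool:
            return self is DeepEPMode.LOW_LATENCY

    class Buffer:                        # stand-in for DeepEPBuffer
        _dispatch_mode = None

        @classmethod
        def set_dispatch_mode_as_normal(cls):
            cls._dispatch_mode = DeepEPDispatchMode.NORMAL

        @classmethod
        def set_dispatch_mode_as_low_latency(cls):
            cls._dispatch_mode = DeepEPDispatchMode.LOW_LATENCY

        @classmethod
        def set_dispatch_mode(cls, mode: DeepEPMode):
            # Same branch structure as the new classmethod in the diff above.
            if mode.is_low_latency():
                cls.set_dispatch_mode_as_low_latency()
            elif mode.is_normal():
                cls.set_dispatch_mode_as_normal()
            else:
                raise Exception("unsupported mode")

    Buffer.set_dispatch_mode(DeepEPMode.LOW_LATENCY)
    assert Buffer._dispatch_mode is DeepEPDispatchMode.LOW_LATENCY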
python/sglang/srt/model_executor/cuda_graph_runner.py

@@ -40,6 +40,8 @@ from sglang.srt.layers.dp_attention import (
     set_dp_buffer_len,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.token_dispatcher.deepep import DeepEPBuffer
+from sglang.srt.layers.moe.utils import get_deepep_mode, get_moe_a2a_backend
 from sglang.srt.layers.torchao_utils import save_gemlite_cache
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,

@@ -240,6 +242,8 @@ class CudaGraphRunner:
         self.attn_tp_size = get_attention_tp_size()
         self.attn_tp_rank = get_attention_tp_rank()
 
+        self.deepep_adapter = DeepEPCudaGraphRunnerAdapter()
+
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
         log_info_on_rank0(logger, f"Capture cuda graph bs {self.capture_bs}")

@@ -653,6 +657,8 @@ class CudaGraphRunner:
             )
             return logits_output_or_pp_proxy_tensors
 
+        self.deepep_adapter.capture(is_extend_in_batch=False)
+
         for _ in range(2):
             self.device_module.synchronize()
             self.model_runner.tp_group.barrier()

@@ -796,6 +802,8 @@ class CudaGraphRunner:
         skip_attn_backend_init: bool = False,
         pp_proxy_tensors: Optional[PPProxyTensors] = None,
     ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        self.deepep_adapter.replay()
+
         if not skip_attn_backend_init:
             self.replay_prepare(forward_batch, pp_proxy_tensors)
         else:

@@ -872,3 +880,23 @@ CUDA_GRAPH_CAPTURE_FAILED_MSG = (
     "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
     "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose\n"
 )
+
+
+class DeepEPCudaGraphRunnerAdapter:
+    def __init__(self):
+        # Record DeepEP mode used during capture to ensure replay consistency
+        self._captured_deepep_mode = None
+
+    def capture(self, is_extend_in_batch: bool):
+        if not get_moe_a2a_backend().is_deepep():
+            return
+        self._captured_deepep_mode = get_deepep_mode().resolve(
+            is_extend_in_batch=is_extend_in_batch
+        )
+        DeepEPBuffer.set_dispatch_mode(self._captured_deepep_mode)
+
+    def replay(self):
+        if not get_moe_a2a_backend().is_deepep():
+            return
+        assert self._captured_deepep_mode is not None
+        DeepEPBuffer.set_dispatch_mode(self._captured_deepep_mode)
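Why the adapter exists: a captured CUDA graph replays exactly the kernels recorded at capture time, but DeepEPBuffer's dispatch mode is process-global state that other work (e.g. a prefill batch between decode steps) may have flipped since capture. The adapter therefore records the resolved mode at capture and re-applies it before every replay. A minimal sketch of that contract, using hypothetical stand-ins (current_mode, resolve_mode, Adapter) rather than sglang's real module state:

    # Global dispatch mode that other code may mutate between capture and replay;
    # stand-in for DeepEPBuffer._dispatch_mode.
    current_mode = "normal"

    def resolve_mode(is_extend_in_batch: bool) -> str:
        # Assumed resolution policy: extend batches use normal dispatch,
        # pure-decode batches use low-latency dispatch.
        return "normal" if is_extend_in_batch else "low_latency"

    class Adapter:
        def __init__(self):
            self._captured_mode = None

        def capture(self, is_extend_in_batch: bool):
            global current_mode
            self._captured_mode = resolve_mode(is_extend_in_batch)
            current_mode = self._captured_mode

        def replay(self):
            global current_mode
            assert self._captured_mode is not None, "replay before capture"
            current_mode = self._captured_mode

    adapter = Adapter()
    adapter.capture(is_extend_in_batch=False)  # decode graph -> low_latency
    current_mode = "normal"                    # e.g. a prefill ran in between
    adapter.replay()                           # flips mode back before graph replay
    assert current_mode == "low_latency"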
python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py

@@ -9,6 +9,7 @@ from sglang.srt.layers.dp_attention import DpPaddingMode, set_dp_buffer_len
 from sglang.srt.model_executor.cuda_graph_runner import (
     CUDA_GRAPH_CAPTURE_FAILED_MSG,
     CudaGraphRunner,
+    DeepEPCudaGraphRunnerAdapter,
     get_batch_sizes_to_capture,
     get_global_graph_memory_pool,
     model_capture_mode,

@@ -61,6 +62,7 @@ class EAGLEDraftCudaGraphRunner:
         self.enable_profile_cuda_graph = (
             model_runner.server_args.enable_profile_cuda_graph
         )
+        self.deepep_adapter = DeepEPCudaGraphRunnerAdapter()
         server_args = model_runner.server_args
 
         # Batch sizes to capture

@@ -264,6 +266,8 @@ class EAGLEDraftCudaGraphRunner:
             forward_batch.spec_info.hidden_states = hidden_states_backup
             return ret
 
+        self.deepep_adapter.capture(is_extend_in_batch=False)
+
         for _ in range(2):
             torch.cuda.synchronize()
             self.model_runner.tp_group.barrier()

@@ -285,6 +289,8 @@ class EAGLEDraftCudaGraphRunner:
     def replay(self, forward_batch: ForwardBatch):
         assert forward_batch.out_cache_loc is not None
+        self.deepep_adapter.replay()
+
         raw_bs = forward_batch.batch_size
         raw_num_token = raw_bs * self.num_tokens_per_bs
python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py

@@ -9,6 +9,7 @@ from sglang.srt.layers.dp_attention import DpPaddingMode, set_dp_buffer_len
 from sglang.srt.model_executor.cuda_graph_runner import (
     CUDA_GRAPH_CAPTURE_FAILED_MSG,
     CudaGraphRunner,
+    DeepEPCudaGraphRunnerAdapter,
     LogitsProcessorOutput,
     get_batch_sizes_to_capture,
     get_global_graph_memory_pool,

@@ -61,6 +62,7 @@ class EAGLEDraftExtendCudaGraphRunner:
         )
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
         self.padded_static_len = -1
+        self.deepep_adapter = DeepEPCudaGraphRunnerAdapter()
 
         # Attention backend
         self.num_tokens_per_bs = self.speculative_num_steps + 1

@@ -243,6 +245,8 @@ class EAGLEDraftExtendCudaGraphRunner:
             )
             spec_info.positions = None
 
+        self.deepep_adapter.capture(is_extend_in_batch=True)
+
         # Forward batch
         forward_batch = ForwardBatch(
             forward_mode=ForwardMode.DRAFT_EXTEND,

@@ -318,6 +322,8 @@ class EAGLEDraftExtendCudaGraphRunner:
     def replay(self, forward_batch: ForwardBatch):
         assert forward_batch.out_cache_loc is not None
+        self.deepep_adapter.replay()
+
         # batch_size and num_seqs can be different in case there are finished examples
         # in the batch, which will not be counted as num_seqs
         raw_bs = forward_batch.batch_size
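Across the three runners, the only difference is the is_extend_in_batch flag passed to capture(): the decode CudaGraphRunner and the EAGLE draft (decode) runner pass False, while the EAGLE draft-extend runner passes True. A hedged sketch of how DeepEPMode.resolve plausibly maps that flag to a concrete mode; the AUTO policy below is an assumption about sglang's behavior, not shown in this commit:

    from enum import Enum

    class DeepEPMode(Enum):
        NORMAL = "normal"
        LOW_LATENCY = "low_latency"
        AUTO = "auto"

        def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
            if self is not DeepEPMode.AUTO:
                return self  # explicit modes resolve to themselves
            # Assumed AUTO policy: extend/prefill batches need normal dispatch,
            # pure-decode batches can take the low-latency path.
            if is_extend_in_batch:
                return DeepEPMode.NORMAL
            return DeepEPMode.LOW_LATENCY

    # Matches the flags used in the diffs above:
    assert DeepEPMode.AUTO.resolve(is_extend_in_batch=False) is DeepEPMode.LOW_LATENCY  # decode / EAGLE draft
    assert DeepEPMode.AUTO.resolve(is_extend_in_batch=True) is DeepEPMode.NORMAL        # EAGLE draft-extend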