Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
ce399e15
Unverified
Commit
ce399e15
authored
Oct 19, 2025
by
fzyzcjy
Committed by
GitHub
Oct 19, 2025
Browse files
Make single-batch overlap compatible with NextN (#11804)
parent
ea6275df
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
9 additions
and
4 deletions
+9
-4
python/sglang/srt/layers/moe/ep_moe/layer.py
python/sglang/srt/layers/moe/ep_moe/layer.py
+2
-0
python/sglang/srt/models/deepseek_v2.py
python/sglang/srt/models/deepseek_v2.py
+2
-0
python/sglang/srt/single_batch_overlap.py
python/sglang/srt/single_batch_overlap.py
+5
-4
No files found.
python/sglang/srt/layers/moe/ep_moe/layer.py
View file @
ce399e15
...
@@ -170,6 +170,7 @@ class DeepEPMoE(FusedMoE):
...
@@ -170,6 +170,7 @@ class DeepEPMoE(FusedMoE):
forward_batch
:
ForwardBatch
,
forward_batch
:
ForwardBatch
,
forward_shared_experts
=
None
,
forward_shared_experts
=
None
,
alt_stream
=
None
,
alt_stream
=
None
,
disable_sbo
=
False
,
):
):
# We have to call SBO inside MoE to be compatible with hooks used in offloading
# We have to call SBO inside MoE to be compatible with hooks used in offloading
return
single_batch_overlap
.
execute_sbo
(
return
single_batch_overlap
.
execute_sbo
(
...
@@ -181,6 +182,7 @@ class DeepEPMoE(FusedMoE):
...
@@ -181,6 +182,7 @@ class DeepEPMoE(FusedMoE):
experts
=
self
,
experts
=
self
,
forward_shared_experts
=
forward_shared_experts
,
forward_shared_experts
=
forward_shared_experts
,
alt_stream
=
alt_stream
,
alt_stream
=
alt_stream
,
disable_sbo
=
disable_sbo
,
)
)
def
dispatch
(
def
dispatch
(
...
...
python/sglang/srt/models/deepseek_v2.py
View file @
ce399e15
...
@@ -902,6 +902,8 @@ class DeepseekV2MoE(nn.Module):
...
@@ -902,6 +902,8 @@ class DeepseekV2MoE(nn.Module):
dict
(
dict
(
forward_shared_experts
=
_forward_shared_experts_and_put_results
,
forward_shared_experts
=
_forward_shared_experts_and_put_results
,
alt_stream
=
self
.
alt_stream
,
alt_stream
=
self
.
alt_stream
,
# SBO is not yet implemented for NextN
disable_sbo
=
self
.
is_nextn
,
)
)
if
self
.
_fuse_shared_experts_inside_sbo
if
self
.
_fuse_shared_experts_inside_sbo
else
{}
else
{}
...
...
python/sglang/srt/single_batch_overlap.py
View file @
ce399e15
...
@@ -60,13 +60,14 @@ def execute_sbo(
...
@@ -60,13 +60,14 @@ def execute_sbo(
topk_weights
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
forward_batch
:
ForwardBatch
,
forward_batch
:
ForwardBatch
,
alt_stream
:
Optional
=
None
,
alt_stream
:
Optional
=
None
,
disable_sbo
:
bool
=
False
,
):
):
dispatch_output
=
experts
.
dispatch
(
dispatch_output
=
experts
.
dispatch
(
hidden_states
,
topk_idx
,
topk_weights
,
forward_batch
hidden_states
,
topk_idx
,
topk_weights
,
forward_batch
)
)
combine_overlap_args
,
down_gemm_overlap_args
,
meta_overlap_args
=
(
combine_overlap_args
,
down_gemm_overlap_args
,
meta_overlap_args
=
(
_compute_overlap_args
(
dispatch_output
,
alt_stream
)
_compute_overlap_args
(
dispatch_output
,
alt_stream
,
disable_sbo
=
disable_sbo
)
)
)
hidden_states
=
experts
.
moe_impl
(
hidden_states
=
experts
.
moe_impl
(
...
@@ -75,7 +76,7 @@ def execute_sbo(
...
@@ -75,7 +76,7 @@ def execute_sbo(
if
(
e
:
=
meta_overlap_args
.
get
(
"record_event_after_down"
))
is
not
None
:
if
(
e
:
=
meta_overlap_args
.
get
(
"record_event_after_down"
))
is
not
None
:
e
.
record
()
e
.
record
()
if
SboFlags
.
enable_combine_shared_two_stream_overlap
():
if
(
not
disable_sbo
)
and
SboFlags
.
enable_combine_shared_two_stream_overlap
():
# TODO reduce sm for non-deepgemm
# TODO reduce sm for non-deepgemm
with
deep_gemm_wrapper
.
configure_deep_gemm_num_sms
(
with
deep_gemm_wrapper
.
configure_deep_gemm_num_sms
(
meta_overlap_args
[
"compute_num_sms"
]
meta_overlap_args
[
"compute_num_sms"
]
...
@@ -93,8 +94,8 @@ def execute_sbo(
...
@@ -93,8 +94,8 @@ def execute_sbo(
return
hidden_states
return
hidden_states
def
_compute_overlap_args
(
dispatch_output
,
alt_stream
):
def
_compute_overlap_args
(
dispatch_output
,
alt_stream
,
disable_sbo
):
if
not
(
if
disable_sbo
or
not
(
SboFlags
.
enable_combine_down_gemm_two_stream_overlap
()
SboFlags
.
enable_combine_down_gemm_two_stream_overlap
()
or
SboFlags
.
enable_combine_shared_two_stream_overlap
()
or
SboFlags
.
enable_combine_shared_two_stream_overlap
()
):
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment