Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
91a066ec
Unverified
Commit
91a066ec
authored
Jun 17, 2025
by
fzyzcjy
Committed by
GitHub
Jun 16, 2025
Browse files
Tiny remove comments about DeepEP on H20 (#7234)
parent
c4943867
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
0 additions
and
32 deletions
+0
-32
python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py
python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py
+0
-32
No files found.
python/sglang/srt/layers/moe/ep_moe/token_dispatcher.py
View file @
91a066ec
...
@@ -542,38 +542,6 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
...
@@ -542,38 +542,6 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
topk_idx
:
torch
.
Tensor
,
topk_idx
:
torch
.
Tensor
,
use_fp8
:
bool
=
False
,
use_fp8
:
bool
=
False
,
):
):
"""
# For H20, there will be an CUDA error: DeepEP/csrc/kernels/internode_ll.cu:337 'too many blocks in cooperative launch'.
# Please make sure to change DeepEP code in internode_ll.cu dispatch / combine as below first and then reinstall.
# More details refer: https://github.com/deepseek-ai/DeepEP/issues/15#issuecomment-2709715782
diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu
index 76ae2e2..8ecd08f 100644
--- a/csrc/kernels/internode_ll.cu
+++ b/csrc/kernels/internode_ll.cu
@@ -310,8 +310,8 @@ void dispatch(void* packed_recv_x, float* packed_recv_x_scales,
int num_topk, int num_experts, int rank, int num_ranks, bool use_fp8,
void* workspace, cudaStream_t stream, int phases) {
constexpr int kNumMaxTopK = 9;
- constexpr int kNumWarpsPerGroup = 10;
- constexpr int kNumWarpGroups = 3;
+ constexpr int kNumWarpsPerGroup = 8;
+ constexpr int kNumWarpGroups = 4;
EP_STATIC_ASSERT(kNumMaxTopK + 1 <= kNumWarpGroups * kNumWarpsPerGroup, "Too many top-k selections");
const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup;
@@ -501,8 +501,8 @@ void combine(void* combined_x,
int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
int num_topk, int num_experts, int rank, int num_ranks,
void* workspace, cudaStream_t stream, int phases) {
- constexpr int kNumWarpsPerGroup = 10;
- constexpr int kNumWarpGroups = 3;
+ constexpr int kNumWarpsPerGroup = 8;
+ constexpr int kNumWarpGroups = 4;
constexpr int kNumMaxTopk = 9;
const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup;
"""
buffer
=
self
.
_get_buffer
()
buffer
=
self
.
_get_buffer
()
packed_recv_hidden
,
packed_recv_count
,
self
.
handle
,
event
,
hook
=
(
packed_recv_hidden
,
packed_recv_count
,
self
.
handle
,
event
,
hook
=
(
buffer
.
low_latency_dispatch
(
buffer
.
low_latency_dispatch
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment