Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
d389bedf
Unverified
Commit
d389bedf
authored
Jul 09, 2025
by
jianan-gu
Committed by
GitHub
Jul 09, 2025
Browse files
[CPU][Qwen3 MoE] Enable fused_topk CPU fusion and enhance FP8 TP padding (#7838)
parent
ac80f4da
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
26 additions
and
4 deletions
+26
-4
python/sglang/srt/layers/moe/topk.py
python/sglang/srt/layers/moe/topk.py
+7
-1
python/sglang/srt/layers/parameter.py
python/sglang/srt/layers/parameter.py
+19
-3
No files found.
python/sglang/srt/layers/moe/topk.py
View file @
d389bedf
...
@@ -83,13 +83,18 @@ def fused_topk_cpu(
...
@@ -83,13 +83,18 @@ def fused_topk_cpu(
gating_output
:
torch
.
Tensor
,
gating_output
:
torch
.
Tensor
,
topk
:
int
,
topk
:
int
,
renormalize
:
bool
,
renormalize
:
bool
,
num_token_non_padded
:
Optional
[
torch
.
Tensor
]
=
None
,
expert_location_dispatch_info
:
Optional
[
ExpertLocationDispatchInfo
]
=
None
,
):
):
return
torch
.
ops
.
sgl_kernel
.
topk_softmax_cpu
(
topk_weights
,
topk_ids
=
torch
.
ops
.
sgl_kernel
.
topk_softmax_cpu
(
hidden_states
=
hidden_states
,
hidden_states
=
hidden_states
,
gating_output
=
gating_output
,
gating_output
=
gating_output
,
topk
=
topk
,
topk
=
topk
,
renormalize
=
renormalize
,
renormalize
=
renormalize
,
)
)
topk_ids
=
topk_ids_logical_to_physical
(
topk_ids
,
expert_location_dispatch_info
)
_mask_topk_ids_padded_region
(
topk_ids
,
num_token_non_padded
)
return
topk_weights
,
topk_ids
def
fused_topk
(
def
fused_topk
(
...
@@ -411,6 +416,7 @@ if _is_cpu and _is_cpu_amx_available:
...
@@ -411,6 +416,7 @@ if _is_cpu and _is_cpu_amx_available:
biased_grouped_topk
=
biased_grouped_topk_cpu
biased_grouped_topk
=
biased_grouped_topk_cpu
grouped_topk
=
grouped_topk_cpu
grouped_topk
=
grouped_topk_cpu
fused_topk_native
=
fused_topk_cpu
fused_topk_native
=
fused_topk_cpu
fused_topk
=
fused_topk_cpu
else
:
else
:
biased_grouped_topk
=
biased_grouped_topk_gpu
biased_grouped_topk
=
biased_grouped_topk_gpu
grouped_topk
=
grouped_topk_gpu
grouped_topk
=
grouped_topk_gpu
...
...
python/sglang/srt/layers/parameter.py
View file @
d389bedf
...
@@ -187,6 +187,22 @@ class _ColumnvLLMParameter(BasevLLMParameter):
...
@@ -187,6 +187,22 @@ class _ColumnvLLMParameter(BasevLLMParameter):
param_data
=
self
.
data
param_data
=
self
.
data
shard_id
=
tp_rank
if
shard_id
==
"q"
else
tp_rank
//
num_heads
shard_id
=
tp_rank
if
shard_id
==
"q"
else
tp_rank
//
num_heads
param_data
=
param_data
.
narrow
(
self
.
output_dim
,
shard_offset
,
shard_size
)
param_data
=
param_data
.
narrow
(
self
.
output_dim
,
shard_offset
,
shard_size
)
if
_is_cpu
:
from
sglang.srt.model_loader.weight_utils
import
(
narrow_padded_param_and_loaded_weight
,
)
param_data
,
loaded_weight
=
narrow_padded_param_and_loaded_weight
(
param_data
,
loaded_weight
,
0
,
# param_data_start
shard_id
*
shard_size
,
self
.
output_dim
,
shard_size
,
not
use_presharded_weights
,
)
else
:
if
not
use_presharded_weights
:
if
not
use_presharded_weights
:
loaded_weight
=
loaded_weight
.
narrow
(
loaded_weight
=
loaded_weight
.
narrow
(
self
.
output_dim
,
shard_id
*
shard_size
,
shard_size
self
.
output_dim
,
shard_id
*
shard_size
,
shard_size
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment