change / sglang / Commits / 0ae1e9a7

Unverified commit 0ae1e9a7, authored Jun 16, 2025 by Xiaoyu Zhang, committed by GitHub on Jun 15, 2025.
refine fused_moe benchmark (#7221)
Parent: e07d0647

Showing 3 changed files with 10 additions and 102 deletions (+10 −102):
- benchmark/kernels/fused_moe_triton/benchmark_ep_pre_reorder_triton.py (+0 −101)
- benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py (+5 −1)
- benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py (+5 −0)
benchmark/kernels/fused_moe_triton/benchmark_ep_pre_reorder_triton.py (deleted; file mode 100644 → 0)
```python
import argparse
import itertools

import pandas as pd
import torch
import triton

from sglang.srt.layers.moe.ep_moe.kernels import pre_reorder_triton_kernel


def benchmark_pre_reorder(batch_size, topk, model_config):
    hidden_size = model_config["hidden_size"]
    block_size = model_config["block_size"]
    expert_range = model_config["expert_range"]

    input_ptr = torch.randn(
        batch_size, hidden_size, dtype=torch.float16, device="cuda"
    )
    gateup_input_ptr = torch.zeros(
        batch_size * topk, hidden_size, dtype=torch.float16, device="cuda"
    )
    src2dst_ptr = torch.randint(
        0, batch_size * topk, (batch_size, topk), dtype=torch.int32, device="cuda"
    )
    topk_ids_ptr = torch.randint(
        expert_range[0],
        expert_range[1] + 1,
        (batch_size, topk),
        dtype=torch.int32,
        device="cuda",
    )
    a1_scales_ptr = torch.rand(
        expert_range[1] - expert_range[0] + 1, dtype=torch.float32, device="cuda"
    )

    input_ptr = input_ptr.view(-1)
    gateup_input_ptr = gateup_input_ptr.view(-1)
    src2dst_ptr = src2dst_ptr.view(-1)
    topk_ids_ptr = topk_ids_ptr.view(-1)

    def run_kernel():
        pre_reorder_triton_kernel[(batch_size,)](
            input_ptr,
            gateup_input_ptr,
            src2dst_ptr,
            topk_ids_ptr,
            a1_scales_ptr,
            expert_range[0],
            expert_range[1],
            topk,
            hidden_size,
            block_size,
            use_per_token_if_dynamic=True,
        )

    for _ in range(10):
        run_kernel()
    torch.cuda.synchronize()

    ms, _, _ = triton.testing.do_bench(run_kernel, quantiles=[0.5, 0.2, 0.8])
    return ms


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hidden-size", type=int, required=True)
    parser.add_argument("--block-size", type=int, default=512)
    args = parser.parse_args()

    model_config = {
        "hidden_size": args.hidden_size,
        "block_size": args.block_size,
        "expert_range": (0, 255),
    }

    batch_sizes = [64, 128, 256, 512, 640, 768, 1024]
    topks = [2, 4, 8]
    configs = list(itertools.product(batch_sizes, topks))

    # Prepare results dict: keys = topk, each row is indexed by batch_size
    results_dict = {topk: {} for topk in topks}

    for batch_size, topk in configs:
        ms = benchmark_pre_reorder(batch_size, topk, model_config)
        results_dict[topk][batch_size] = ms

    # Build dataframe
    df = pd.DataFrame(
        {
            "batch_size": batch_sizes,
            **{
                f"TopK={topk}": [
                    results_dict[topk].get(bs, None) for bs in batch_sizes
                ]
                for topk in topks
            },
        }
    )

    print("\npre-reorder-performance:")
    print(df.to_string(index=False, float_format="%.6f"))


if __name__ == "__main__":
    main()
```
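For orientation, here is a pure-PyTorch reference of the step the deleted script benchmarked. This is a minimal sketch of one reading of `pre_reorder_triton_kernel`'s contract; the scatter via `src2dst`, the expert-range filter, and the direction of the scale factor are assumptions, not code from this commit:

```python
# Assumed pure-PyTorch reference for the pre-reorder step benchmarked above.
# Not from the commit: scatter destinations, expert-range filter, and the
# 1/scale direction are one reading of the kernel's contract.
import torch

def pre_reorder_reference(x, src2dst, topk_ids, a1_scales, start_expert, end_expert):
    # x:         (batch_size, hidden_size) tokens entering the MoE layer
    # src2dst:   (batch_size, topk) destination row for each (token, expert) pair
    # topk_ids:  (batch_size, topk) expert id chosen for each pair
    # a1_scales: (num_local_experts,) per-expert activation scales
    batch_size, topk = src2dst.shape
    out = torch.zeros(batch_size * topk, x.shape[1], dtype=x.dtype, device=x.device)
    for i in range(batch_size):
        for j in range(topk):
            e = int(topk_ids[i, j])
            if start_expert <= e <= end_expert:  # only experts owned by this EP rank
                scale = 1.0 / a1_scales[e - start_expert]  # assumed direction
                out[src2dst[i, j]] = (x[i].float() * scale).to(x.dtype)
    return out
```

The deleted script itself was driven from the command line, e.g. `python benchmark_ep_pre_reorder_triton.py --hidden-size 4096`, sweeping batch sizes 64–1024 and top-k values {2, 4, 8} as defined in its `main()`.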
benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py (+5 −1)
```diff
@@ -37,11 +37,15 @@ def get_model_config(model_name: str, tp_size: int):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in [
         "Grok1ForCausalLM",
         "Grok1ImgGen",
```
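The new branch exists because multimodal Llama 4 checkpoints nest the language-model hyperparameters under a text config rather than exposing them at the top level. A hedged sketch of how those fields are reached with `transformers` (the checkpoint name is illustrative, not from this commit):

```python
# Hedged sketch: the Llama 4 MoE fields live on config.text_config, not on
# the top-level config object. The model id below is illustrative only.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct")
assert config.architectures[0] == "Llama4ForConditionalGeneration"
E = config.text_config.num_local_experts        # routed experts per layer
topk = config.text_config.num_experts_per_tok   # experts activated per token
intermediate_size = config.text_config.intermediate_size
```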
benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py (+5 −0)
```diff
@@ -51,6 +51,11 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
+    elif config.architectures[0] == "Llama4ForConditionalGeneration":
+        E = config.text_config.num_local_experts
+        topk = config.text_config.num_experts_per_tok
+        intermediate_size = config.text_config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in [
         "Grok1ForCausalLM",
         "Grok1ImgGen",
```
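Both hunks reuse the same sharding arithmetic, `2 * intermediate_size // tp_size`. A quick worked example with illustrative numbers; the factor of 2 reflects the common convention that the fused MoE weight packs the gate and up projections together, which is an assumption about this codebase rather than something stated in the diff:

```python
# Worked example of the sharding arithmetic, with illustrative values.
intermediate_size = 8192   # per-expert FFN width (illustrative)
tp_size = 8                # tensor-parallel degree (illustrative)
# Factor 2: gate and up projections assumed fused into one weight tensor.
shard_intermediate_size = 2 * intermediate_size // tp_size
assert shard_intermediate_size == 2048
```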