Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d7b28f34
Unverified
Commit
d7b28f34
authored
Aug 04, 2025
by
Wentao Ye
Committed by
GitHub
Aug 04, 2025
Browse files
[Log] DeepGEMM Update Log for Unaligned Problem Size (#22208)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
6fa41e0c
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
23 additions
and
8 deletions
+23
-8
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+19
-2
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+2
-4
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+2
-2
No files found.
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
View file @
d7b28f34
...
@@ -33,7 +33,7 @@ def deep_gemm_block_shape() -> list[int]:
...
@@ -33,7 +33,7 @@ def deep_gemm_block_shape() -> list[int]:
return
[
block
,
block
]
return
[
block
,
block
]
def
_valid_deep_gemm_shape
(
M
:
int
,
N
:
int
,
K
:
int
):
def
_valid_deep_gemm_shape
(
M
:
int
,
N
:
int
,
K
:
int
)
->
bool
:
align
=
deep_gemm_block_shape
()[
0
]
align
=
deep_gemm_block_shape
()[
0
]
return
align
<=
M
and
N
%
align
==
0
and
K
%
align
==
0
return
align
<=
M
and
N
%
align
==
0
and
K
%
align
==
0
...
@@ -51,9 +51,26 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
...
@@ -51,9 +51,26 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
M
=
hidden_states
.
size
(
0
)
M
=
hidden_states
.
size
(
0
)
_
,
K
,
N
=
w2
.
size
()
_
,
K
,
N
=
w2
.
size
()
align
=
deep_gemm_block_shape
()[
0
]
if
not
_valid_deep_gemm_shape
(
M
,
N
,
K
):
if
not
_valid_deep_gemm_shape
(
M
,
N
,
K
):
logger
.
debug_once
(
logger
.
debug_once
(
"DeepGemm disabled: unaligned problem size. M: %s, N: %s, K: %s"
,
"DeepGemm disabled due to unaligned problem size. "
"M: %s, N: %s, K: %s. M should be >= align size "
"and N and K must be multiples of %s. "
"This is not an error and we will fall back to triton."
,
M
,
N
,
K
,
align
,
)
return
False
elif
N
<=
512
:
logger
.
debug_once
(
"DeepGemm disabled for N <= 512. M: %s, N: %s, K: %s. "
"This means we will fall back to triton "
"for this specific shape for further speed up."
,
M
,
M
,
N
,
N
,
K
,
K
,
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
d7b28f34
...
@@ -1360,10 +1360,8 @@ def fused_experts(
...
@@ -1360,10 +1360,8 @@ def fused_experts(
# E8M0 scale, which means we requantize the weight and input to the specific
# E8M0 scale, which means we requantize the weight and input to the specific
# scale. Falling back to cutlass or triton in some cases would cause
# scale. Falling back to cutlass or triton in some cases would cause
# accuracy issues.
# accuracy issues.
N
=
w1
.
size
(
1
)
should_use_deep_gemm
=
is_blackwell_deep_gemm_used
()
or
_valid_deep_gemm
(
should_use_deep_gemm
=
((
N
>
512
hidden_states
,
w1
,
w2
)
and
_valid_deep_gemm
(
hidden_states
,
w1
,
w2
))
or
is_blackwell_deep_gemm_used
())
if
(
allow_deep_gemm
and
use_fp8_w8a8
and
should_use_deep_gemm
):
if
(
allow_deep_gemm
and
use_fp8_w8a8
and
should_use_deep_gemm
):
assert
apply_router_weight_on_input
is
False
assert
apply_router_weight_on_input
is
False
assert
is_act_and_mul
,
(
assert
is_act_and_mul
,
(
...
...
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
View file @
d7b28f34
...
@@ -107,8 +107,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -107,8 +107,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
# Note: the deep gemm workspaces are strictly larger than the triton
# Note: the deep gemm workspaces are strictly larger than the triton
# workspaces so we can be pessimistic here and allocate for DeepGemm
# workspaces so we can be pessimistic here and allocate for DeepGemm
# even if we fall back to triton later, e.g. if expert maps are set.
# even if we fall back to triton later, e.g. if expert maps are set.
if
self
.
allow_deep_gemm
and
(
_valid_deep_gemm_shape
(
M
,
N
,
K
)
if
self
.
allow_deep_gemm
and
(
is_blackwell_deep_gemm_used
(
)
or
is_blackwell_deep_gemm_used
(
)):
or
_valid_deep_gemm_shape
(
M
,
N
,
K
)):
assert
self
.
deep_gemm_expert
is
not
None
assert
self
.
deep_gemm_expert
is
not
None
return
self
.
deep_gemm_expert
.
workspace_shapes
(
return
self
.
deep_gemm_expert
.
workspace_shapes
(
a
,
aq
,
M
,
N
,
K
,
topk
,
global_num_experts
,
local_num_experts
,
a
,
aq
,
M
,
N
,
K
,
topk
,
global_num_experts
,
local_num_experts
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment