Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c757a15f
Unverified
Commit
c757a15f
authored
Nov 06, 2025
by
xiangze-arm
Committed by
GitHub
Nov 06, 2025
Browse files
[CPU]Improve cpu fused moe perf (#27244)
Signed-off-by:
Zhang Xiangze
<
Xiangze.Zhang@arm.com
>
parent
59a50afa
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
40 additions
and
12 deletions
+40
-12
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
+40
-12
No files found.
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
View file @
c757a15f
...
...
@@ -5,6 +5,7 @@ from collections.abc import Callable
import
torch
from
torch.nn
import
functional
as
F
from
vllm
import
_custom_ops
as
ops
from
vllm
import
envs
...
...
@@ -237,7 +238,43 @@ class SGLFusedMOE:
class
CPUFusedMOE
:
def __init__(self, layer: torch.nn.Module) -> None:
    """Precompute one gate/up and one down projection callable per expert.

    Reads ``layer.w13_weight`` and ``layer.w2_weight`` (indexed along
    dim 0, one slice per expert) together with the optional
    ``layer.w13_bias`` / ``layer.w2_bias`` attributes, and stores two
    lists on ``layer``: ``gate_up_linear`` and ``down_linear``. Entry
    ``i`` of each list is a callable that takes the input tensor for
    expert ``i`` and returns the projected tensor.

    When ``ops._supports_onednn`` and ``ops.is_onednn_acl_supported()``
    are both truthy, the callables dispatch to ``ops.onednn_mm`` through
    handles built by ``ops.create_onednn_mm``; otherwise they call
    ``F.linear`` on the weight slices directly.

    Args:
        layer: Module carrying the expert weights. It is mutated in
            place: the two callable lists are attached and, on the
            oneDNN path, ``w13_weight`` / ``w2_weight`` are replaced by
            empty parameters.
    """
    onednn_enabled = ops._supports_onednn and ops.is_onednn_acl_supported()

    def _build_linear(weight, bias):
        # Return a single-argument projection for one expert. Weight,
        # bias and handle are bound as lambda defaults so every expert
        # keeps its own values.
        if onednn_enabled:
            # NOTE(review): 32 is passed straight to create_onednn_mm;
            # presumably a primitive cache size - confirm against ops.
            mm_handle = ops.create_onednn_mm(weight.t(), 32)
            return lambda x, handle=mm_handle, bias=bias: ops.onednn_mm(
                handle, x, bias
            )
        return lambda x, w=weight, b=bias: F.linear(x, w, b)

    expert_count = layer.w13_weight.size(0)
    w13_has_bias = hasattr(layer, "w13_bias")
    w2_has_bias = hasattr(layer, "w2_bias")

    layer.gate_up_linear = []
    layer.down_linear = []

    for expert in range(expert_count):
        w13_bias = None
        if w13_has_bias:
            w13_bias = layer.w13_bias[expert]
        w2_bias = None
        if w2_has_bias:
            w2_bias = layer.w2_bias[expert]

        # Gate/up is built before down, so handle creation order per
        # expert is gate/up first, then down.
        layer.gate_up_linear.append(
            _build_linear(layer.w13_weight[expert], w13_bias)
        )
        layer.down_linear.append(
            _build_linear(layer.w2_weight[expert], w2_bias)
        )

    if onednn_enabled:
        # The per-expert callables now go through the oneDNN handles, so
        # the original weight parameters are swapped for empty ones.
        # NOTE(review): presumably the handles keep their own copy of
        # the weights - confirm before relying on this.
        layer.w13_weight = torch.nn.Parameter(
            torch.empty(0), requires_grad=False
        )
        layer.w2_weight = torch.nn.Parameter(
            torch.empty(0), requires_grad=False
        )
def
__call__
(
self
,
...
...
@@ -287,8 +324,6 @@ class CPUFusedMOE:
outputs
=
[]
start_idx
=
0
has_w13_bias
=
hasattr
(
layer
,
"w13_bias"
)
has_w2_bias
=
hasattr
(
layer
,
"w2_bias"
)
for
i
,
num_tokens
in
enumerate
(
tokens_per_expert
):
end_idx
=
start_idx
+
num_tokens
...
...
@@ -296,19 +331,12 @@ class CPUFusedMOE:
continue
tokens_for_this_expert
=
sorted_tokens
[
start_idx
:
end_idx
]
layer_w13_weight
=
layer
.
w13_weight
[
i
]
layer_w13_bias
=
layer
.
w13_bias
[
i
]
if
has_w13_bias
else
None
layer_w2_weight
=
layer
.
w2_weight
[
i
]
layer_w2_bias
=
layer
.
w2_bias
[
i
]
if
has_w2_bias
else
None
gate_up
=
F
.
linear
(
tokens_for_this_expert
,
layer_w13_weight
,
bias
=
layer_w13_bias
)
gate_up
=
layer
.
gate_up_linear
[
i
](
tokens_for_this_expert
)
if
activation
==
"swigluoai"
:
gate_up
=
swigluoai_and_mul
(
gate_up
)
else
:
gate_up
=
silu_and_mul
(
gate_up
)
expert_out
=
F
.
linear
(
gate_up
,
layer_w2_weight
,
bias
=
layer_w2_bias
)
expert_out
=
layer
.
down_
linear
[
i
]
(
gate_up
)
outputs
.
append
(
expert_out
)
start_idx
=
end_idx
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment