Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fa6e599a
Unverified
Commit
fa6e599a
authored
Jan 23, 2026
by
Karan Bansal
Committed by
GitHub
Jan 23, 2026
Browse files
[Bugfix] Fix _CPU_MOE_ACT AssertionError when vLLM config not set (#32777)
Signed-off-by:
Karan Bansal
<
karanb192@gmail.com
>
parent
7ef58737
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
26 additions
and
26 deletions
+26
-26
tests/kernels/moe/test_cpu_fused_moe.py
tests/kernels/moe/test_cpu_fused_moe.py
+2
-7
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
+24
-19
No files found.
tests/kernels/moe/test_cpu_fused_moe.py
View file @
fa6e599a
...
...
@@ -6,7 +6,7 @@ import torch
from
tests.kernels.allclose_default
import
get_default_atol
,
get_default_rtol
from
vllm._custom_ops
import
cpu_fused_moe
,
cpu_prepack_moe_weight
from
vllm.model_executor.layers.fused_moe.cpu_fused_moe
import
_CPU_MOE_ACT
from
vllm.model_executor.layers.fused_moe.cpu_fused_moe
import
_CPU_MOE_ACT
_FN
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
set_random_seed
...
...
@@ -68,12 +68,7 @@ def ref_fused_moe(
tokens_for_this_expert
,
curr_w13
,
curr_w13_bias
)
# Note: to simulate the kernel implementation
gate_up
=
(
_CPU_MOE_ACT
[
activation
]
.
forward_native
(
gate_up
)
.
to
(
dtype
=
input
.
dtype
)
.
float
()
)
gate_up
=
_CPU_MOE_ACT_FN
[
activation
](
gate_up
).
to
(
dtype
=
input
.
dtype
).
float
()
expert_out
=
torch
.
nn
.
functional
.
linear
(
gate_up
,
curr_w2
,
curr_w2_bias
)
outputs
.
append
(
expert_out
)
...
...
vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
View file @
fa6e599a
...
...
@@ -8,33 +8,38 @@ from torch.nn import functional as F
from
vllm
import
_custom_ops
as
ops
from
vllm._custom_ops
import
cpu_fused_moe
,
cpu_prepack_moe_weight
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
SwigluOAIAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.quantization.utils.layer_utils
import
replace_parameter
from
vllm.utils.torch_utils
import
direct_register_custom_op
_CPU_MOE_LAYER_CACHE
=
{}
class
_LazyActivationDict
(
dict
):
"""Lazily instantiate activation functions on first access.
def
_swigluoai_forward_native
(
x
:
torch
.
Tensor
,
alpha
:
float
=
1.702
,
limit
:
float
=
7.0
,
)
->
torch
.
Tensor
:
"""PyTorch-native implementation of SwigluOAIAndMul.forward_native.
Avoids triggering CustomOp.__init__() at module import time,
which would
call
get_current_vllm_config() before config is set.
Standalone function to avoid instantiating SwigluOAIAndMul (a CustomOp)
which would
trigger
get_current_vllm_config() before config is set.
"""
gate
,
up
=
x
[...,
::
2
],
x
[...,
1
::
2
]
gate
=
gate
.
clamp
(
min
=
None
,
max
=
limit
)
up
=
up
.
clamp
(
min
=-
limit
,
max
=
limit
)
glu
=
gate
*
torch
.
sigmoid
(
gate
*
alpha
)
gated_output
=
(
up
+
1
)
*
glu
return
gated_output
_factories
:
dict
[
str
,
type
[
SiluAndMul
]
|
type
[
SwigluOAIAndMul
]]
=
{
"silu"
:
SiluAndMul
,
"swigluoai"
:
SwigluOAIAndMul
,
}
def
__missing__
(
self
,
key
:
str
)
->
SiluAndMul
|
SwigluOAIAndMul
:
if
key
not
in
self
.
_factories
:
raise
KeyError
(
f
"
{
key
}
is not a supported activation"
)
self
[
key
]
=
self
.
_factories
[
key
]()
return
self
[
key
]
_CPU_MOE_ACT
=
_LazyActivationDict
()
# Map activation names to their native forward functions.
# Uses static methods or standalone functions to avoid instantiating CustomOp
# classes, which would call get_current_vllm_config() before config is set.
_CPU_MOE_ACT_FN
:
dict
[
str
,
Callable
[[
torch
.
Tensor
],
torch
.
Tensor
]]
=
{
"silu"
:
SiluAndMul
.
forward_native
,
"swigluoai"
:
_swigluoai_forward_native
,
}
def
grouped_topk
(
...
...
@@ -230,7 +235,7 @@ class CPUFusedMOE:
apply_router_weight_on_input
:
bool
=
False
,
activation
:
str
=
"silu"
,
)
->
torch
.
Tensor
:
assert
activation
in
_CPU_MOE_ACT
.
_factories
,
f
"
{
activation
}
is not supported."
assert
activation
in
_CPU_MOE_ACT
_FN
,
f
"
{
activation
}
is not supported."
assert
not
apply_router_weight_on_input
topk_weights
,
topk_ids
=
select_experts
(
...
...
@@ -418,7 +423,7 @@ def cpu_fused_moe_torch(
tokens_for_this_expert
=
sorted_tokens
[
start_idx
:
end_idx
]
gate_up
=
layer
.
gate_up_linear
[
i
](
tokens_for_this_expert
)
# type: ignore
gate_up
=
_CPU_MOE_ACT
[
activation
]
.
forward_native
(
gate_up
)
gate_up
=
_CPU_MOE_ACT
_FN
[
activation
](
gate_up
)
expert_out
=
layer
.
down_linear
[
i
](
gate_up
)
# type: ignore
outputs
.
append
(
expert_out
)
start_idx
=
end_idx
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment